import json,os,requests,gdown,zipfile,gzip
from bs4 import BeautifulSoup
from pathlib import Path

# Directory (relative to the current working directory) that holds everything
# this script downloads and generates.
DATA_PATH = Path('Amazon-670K')
# Sub-directory of DATA_PATH from which the per-split ``*.raw.json.gz`` files are read.
RAW_PATH = DATA_PATH.joinpath('Amazon-670K.raw')
# File-name template for the generated text files; ``{}`` receives the split name.
RAW_TEXT_FILE = 'Amazon-670K_raw_texts_{}.txt'


# Get the URL of the raw-text page for Amazon-670K from the repository index.
response = requests.get(
    'http://manikvarma.org/downloads/XC/XMLRepository.html#Julian13',
    timeout=60,  # do not hang forever on an unresponsive server
)
# Fail here instead of silently parsing an HTTP error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
item = soup.find_all('td', align='right')
web_text = ''
for cell in item:
    if cell.getText() == 'Amazon-670K':
        # The <td> right after the dataset-name cell holds the links; the
        # second <a> in it is the one used as the raw-text page.
        links = cell.find_next_siblings('td')[0].select('a')
        web_text = links[1].get('href')
        # Deliberately no ``break``: if several rows match, the last one wins.

if not web_text:
    # Without this check the script would only fail later, when an empty URL
    # is handed to ``requests.get``.
    raise RuntimeError(
        "Could not find the raw-text link for 'Amazon-670K' on the repository page"
    )

# Download the archive (unless it is already on disk) and extract it into DATA_PATH.
# ``exist_ok`` makes the directory creation safe if it already exists.
DATA_PATH.mkdir(parents=True, exist_ok=True)
re_text = requests.get(web_text, timeout=60)  # timeout: never block forever
re_text.raise_for_status()
su_text = BeautifulSoup(re_text.text, 'html.parser')
url_text = ''
# The text of the second <script> tag is split on commas; the first fragment
# containing 'download' is decoded as a JSON value and used as the file URL.
for fragment in su_text.find_all('script')[1].getText().split(','):
    if 'download' in fragment:
        print(fragment)
        url_text = json.loads(fragment)
        break
out_file_text = str(DATA_PATH.resolve() / 'text.zip')
if not os.path.isfile(out_file_text):
    if not url_text:
        # Give a clear error instead of passing an empty URL to gdown.
        raise RuntimeError('Could not find a download URL on page ' + str(web_text))
    gdown.download(url=url_text, output=out_file_text)
# Pass the path (not an open file object) so that ZipFile owns the handle and
# closes it when the ``with`` block exits.
with zipfile.ZipFile(out_file_text) as archive:
    archive.extractall(path=str(DATA_PATH.resolve()))

# Convert each split's gzipped JSON-lines file into a plain-text file with one
# instance per line: "<labels separated by spaces>\t<title> <content>\n".
# NOTE(review): ``raw_text`` is not referenced again in the visible code; it is
# kept in case something else relies on the module-level name.
raw_text = {'test' : list(), 'train' : list()}
for file, partition in zip(['tst.raw.json.gz', 'trn.raw.json.gz'], ['test', 'train']):
    RAW_FILE = RAW_PATH / file
    RAW_TEXT_PATH = DATA_PATH / RAW_TEXT_FILE.format(partition)
    # Both files are managed by ``with`` so they are closed even if a line
    # fails to parse. The input is decoded explicitly as UTF-8 so the result
    # does not depend on the platform's default encoding.
    with open(str(RAW_TEXT_PATH.resolve()), 'w', encoding='utf-8') as out_file, \
            gzip.open(str(RAW_FILE.resolve()), 'rt', encoding='utf-8') as f:
        for line in f:
            instance = json.loads(line)
            labels = ' '.join(str(label) for label in instance['target_ind'])
            # ``[:-1]`` drops the last character of the title before the
            # whitespace is normalised (presumably a trailing delimiter --
            # TODO confirm against the data).
            title = ' '.join(instance['title'][:-1].split())
            content = ' '.join(instance['content'].split())
            out_file.write(labels + '\t' + title + ' ' + content + '\n')
