import json,os,requests,gdown,zipfile,gzip
from bs4 import BeautifulSoup
from pathlib import Path
from sklearn.preprocessing import normalize
import numpy as np

# Working directory (relative to the current directory) that receives the
# downloaded archive, its extracted contents and the converted feature files.
DATA_PATH = Path('Amazon-670K')
# Sub-directory of DATA_PATH from which the per-partition '<partition>.txt'
# feature files are read.
FEATURE_PATH = DATA_PATH.joinpath('Amazon670K.bow')
# File-name template of the converted output; '{}' is filled with the
# partition name.
FEATURE_FILE = 'Amazon-670K_tfidf_{}_ver2.svm'

# get urls for feature
# Fetch the repository index page and look for right-aligned table cells whose
# text is exactly 'Amazon-670K'; the link of interest is the first <a> inside
# the <td> that immediately follows such a cell.
response = requests.get(
    'http://manikvarma.org/downloads/XC/XMLRepository.html#Julian13',
    timeout=60,  # never hang forever on an unresponsive server
)
response.raise_for_status()  # fail here instead of parsing an HTTP error page
soup = BeautifulSoup(response.text, 'html.parser')
item = soup.find_all('td', align='right')
web_text = ''
for cell in item:
    if cell.getText() != 'Amazon-670K':
        continue
    following_cells = cell.find_next_siblings('td')
    anchors = following_cells[0].select('a') if following_cells else []
    if anchors:
        # Deliberately no `break`: when several cells match, the last one
        # carrying a link wins.
        web_text = anchors[0].get('href')
if not web_text:
    raise RuntimeError(
        "Could not find a link for 'Amazon-670K' on the repository page; "
        "the page layout may have changed."
    )

# download file and extract from zip
# `web_text` points at an HTML landing page rather than at the archive itself.
# The actual download URL is taken from the page's second <script> element:
# its text is split on ',' and the first fragment containing 'download' is
# decoded as a JSON value.
DATA_PATH.mkdir(exist_ok=True)  # no check-then-create race
re_text = requests.get(web_text, timeout=60)
re_text.raise_for_status()
su_text = BeautifulSoup(re_text.text, 'html.parser')
script_tags = su_text.find_all('script')
if len(script_tags) < 2:
    raise RuntimeError(
        f'Expected at least two <script> elements at {web_text!r}, '
        f'found {len(script_tags)}; the page layout may have changed.'
    )
url_text = ''
for i in script_tags[1].getText().split(','):
    if 'download' in i:
        url_text = json.loads(i)
        break
if not url_text:
    raise RuntimeError(f'No download URL found in the page at {web_text!r}.')
out_file_text = str(DATA_PATH.resolve() / 'feature.zip')
# An archive left by a previous run is reused instead of downloaded again.
if not os.path.isfile(out_file_text):
    gdown.download(url=url_text, output=out_file_text)
# Pass the path, not an already-open file object: ZipFile only closes files it
# opened itself, so a handle created by the caller would be leaked.
with zipfile.ZipFile(out_file_text) as f:
    f.extractall(path=str(DATA_PATH.resolve()))

# transform feature into svm format and apply l2-normalize
# Reads FEATURE_PATH/<partition>.txt, whose first line is skipped (presumably a
# header) and whose remaining lines look like
#     <labels> <idx>:<val> <idx>:<val> ...
# and writes DATA_PATH/FEATURE_FILE with every feature index shifted by +1 and
# each row's values rescaled by sklearn's `normalize`.
for partition in ['train', 'test']:
    src_path = FEATURE_PATH / f'{partition}.txt'
    dst_path = DATA_PATH / FEATURE_FILE.format(partition)
    # Both files are managed by `with`, so they are closed even when a row
    # fails to parse. The input is opened first so that a missing input does
    # not leave an empty output file behind.
    with open(src_path, 'r') as f, \
            open(dst_path, 'w', encoding='utf-8') as out_file:
        for line_no, line in enumerate(f):
            if line_no == 0:
                continue  # skip the first line
            tokens = line.split()
            if not tokens:
                continue  # blank line (e.g. trailing newline): nothing to emit
            if ':' in tokens[0]:
                # Features are '<idx>:<val>', so a first token containing ':'
                # means the label field of this row is empty. Without this
                # check the first feature would be written out as the label.
                labels, features = '', tokens
            else:
                labels, features = tokens[0], tokens[1:]
            idx_list = []
            val_list = []
            for feature in features:
                idx, val = feature.split(':')
                idx_list.append(int(idx) + 1)
                val_list.append(float(val))
            if val_list:
                # `normalize` works on 2-D input: wrap the row, then unwrap.
                normed = normalize(np.array(val_list)[np.newaxis, :])[0]
            else:
                # `normalize` rejects a zero-width array; a row without
                # features simply has nothing to rescale.
                normed = []
            new_features = ' '.join(
                f'{idx}:{val}' for idx, val in zip(idx_list, normed)
            )
            out_file.write(labels + ' ' + new_features + '\n')
