from sklearn.feature_extraction.text import TfidfVectorizer
from pathlib import Path

# Directory (relative to the working directory) holding the dataset files.
DATA_PATH = Path('.').joinpath('AmazonCat13K')
# File-name templates; the '{}' slots are filled in with str.format.
RAW_TEXT_FILE = 'AmazonCat-13K_raw_texts_{}.txt'  # slot: partition name
TFIDF_FILE = 'AmazonCat-13K_tfidf_{}_{}.svm'  # slots: partition name, version tag
# Vocabulary file, looked up in the working directory.
VOCABULARY_FILE = Path('.').joinpath('vocab.txt')

# Per-partition containers, filled in once the raw text files are read.
raw_text = {name: [] for name in ('test', 'train')}
label = {name: [] for name in ('test', 'train')}
# Placeholder; replaced by the contents of VOCABULARY_FILE.
vocabulary = []

# get vocabulary
# The file holds one term per line. Only the line terminator is stripped:
# blindly dropping the last character would truncate the final term whenever
# the file does not end with a newline. Blank lines are kept as empty terms so
# that every term keeps the index implied by its line number.
# UTF-8 is used explicitly, matching how the raw text files are read.
with open(VOCABULARY_FILE, 'r', encoding='utf-8') as voc:
    vocabulary = [line.rstrip('\n') for line in voc]

# get raw text
# Every line has the form '<labels>\t<text>'. The line is split on the FIRST
# tab only, so a tab inside the text stays part of the text instead of being
# silently dropped. A line without any tab raises a ValueError that names the
# file and line rather than failing later with an opaque unpacking error.
# The text keeps its trailing newline, exactly as read from the file.
for partition in ['test', 'train']:
    RAW_TEXT_PATH = DATA_PATH / RAW_TEXT_FILE.format(partition)
    partition_labels = []
    partition_texts = []
    with open(RAW_TEXT_PATH, 'r', encoding='utf-8') as file:
        for line_no, line in enumerate(file, start=1):
            labels_part, tab, text_part = line.partition('\t')
            if not tab:
                raise ValueError(
                    f'{RAW_TEXT_PATH}, line {line_no}: '
                    'expected "<labels><TAB><text>"'
                )
            partition_labels.append(labels_part)
            partition_texts.append(text_part)
    # Stored as tuples, one entry per document, aligned by position.
    label[partition] = tuple(partition_labels)
    raw_text[partition] = tuple(partition_texts)

# set TfidfVectorizer
# Both vectorizers are fitted on the training partition only.
# ver1: features restricted to the predefined vocabulary, with English stop
#       words, unicode accent stripping and L2 row normalisation requested.
vectorize_ver1 = TfidfVectorizer(
    vocabulary=vocabulary,
    norm='l2',
    stop_words='english',
    strip_accents='unicode',
).fit(raw_text['train'])
# ver2: library defaults.
vectorize_ver2 = TfidfVectorizer().fit(raw_text['train'])

# compute tfidf
def _write_svm(path, labels_per_doc, matrix):
    """Write a sparse feature matrix as svm-style text lines.

    Each output line is '<labels> <index>:<value> <index>:<value> ...', where
    spaces inside the label string become commas and feature indices are
    written 1-based.

    Args:
        path: Destination file; overwritten if it already exists.
        labels_per_doc: One label string per matrix row, in row order.
        matrix: scipy sparse matrix (anything offering ``tolil()``).
    """
    # Convert once; ``rows`` and ``data`` are parallel per-row lists of
    # column indices and values.
    lil = matrix.tolil()
    # The context manager closes the file even if writing fails part-way.
    with open(path, 'w', encoding='utf-8') as out_file:
        for labels, feature, index in zip(labels_per_doc, lil.data, lil.rows):
            label_str = labels.replace(' ', ',')
            feature_str = ' '.join(f'{x+1}:{y}' for x, y in zip(index, feature))
            out_file.write(label_str + ' ' + feature_str + '\n')


for partition in ['test', 'train']:
    TFIDF_PATH_1 = DATA_PATH / TFIDF_FILE.format(partition, 'ver1')
    TFIDF_PATH_2 = DATA_PATH / TFIDF_FILE.format(partition, 'ver2')
    X1 = vectorize_ver1.transform(raw_text[partition])
    X2 = vectorize_ver2.transform(raw_text[partition])
    _write_svm(TFIDF_PATH_1, label[partition], X1)
    _write_svm(TFIDF_PATH_2, label[partition], X2)
