from sklearn.feature_extraction.text import TfidfVectorizer

# Output path of the vocabulary file.
VOCAB_FILE = "data/vocab.txt"

# Path templates: the "{}" placeholder is filled in through str.format()
# with the partition name, either "train" or "test".
INPUT = "data/wiki10_31k_raw_texts_{}.txt"
BOW_OUTPUT = "data/wiki10_31k_tfidf_{}.svm"

# Read the labels and raw text of every document in each partition.
# Each input line is expected to look like "<labels>\t<text>": everything up
# to the first tab is the label field and the remainder (trailing newline
# included) is the document text.
raw_texts = {}
labels = {}
for partition in ['train', 'test']:
    input_path = INPUT.format(partition)
    labels_list = []
    raw_texts_list = []
    # Decode explicitly as UTF-8 rather than with the platform's locale
    # default, so the script reads the same text on every machine.
    # NOTE(review): assumes the dataset files are UTF-8 encoded -- confirm.
    with open(input_path, 'r', encoding='utf-8') as input_f:
        # Stream the file instead of materialising it with readlines().
        for line_no, line in enumerate(input_f, start=1):
            label_field, tab, text = line.partition("\t")
            if not tab:
                if not line.strip():
                    # A blank line (e.g. at the end of the file) holds no
                    # document; skip it instead of failing.
                    continue
                raise ValueError(
                    f"{input_path}:{line_no}: missing tab separator "
                    "between labels and text"
                )
            labels_list.append(label_field)
            raw_texts_list.append(text)
    if not raw_texts_list:
        raise ValueError(f"{input_path}: no documents found")
    # Stored as tuples, one entry per document, in file order.
    labels[partition] = tuple(labels_list)
    raw_texts[partition] = tuple(raw_texts_list)


# Learn the vocabulary and IDF weights from the training documents only.
print("Fitting vectorizer...")
vectorizer = TfidfVectorizer(
    min_df=3,
    norm="l2",
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
)
vectorizer.fit(raw_texts['train'])


# Write each partition as a sparse text file, one document per line:
#   "<label>,<label>,... <index>:<value> <index>:<value> ..."
for partition in ['train', 'test']:
    bow_file = BOW_OUTPUT.format(partition)
    print(f"Generating {bow_file}...")

    # Transform before opening the output so that a failure here does not
    # leave an empty file behind. In LIL format, .data and .rows hold, per
    # document, parallel lists of the non-zero values and their column indices.
    vectors = vectorizer.transform(raw_texts[partition]).tolil()

    # The context manager guarantees the file is flushed and closed even if
    # writing a row fails part-way through. The encoding is set explicitly so
    # the output does not depend on the platform's locale default.
    with open(bow_file, 'w', encoding='utf-8') as bow_output:
        for labels_str, feature, index in zip(labels[partition], vectors.data, vectors.rows):
            # Sanity check: zip() below would silently truncate on a mismatch.
            assert len(feature) == len(index)
            # Labels are space-separated in the input, comma-separated here.
            labels_str = labels_str.replace(' ', ',')

            # feature index starts from 1
            feature_str = ' '.join(f'{col + 1}:{value}' for value, col in zip(feature, index))
            bow_output.write(f'{labels_str} {feature_str}\n')

# output vocabulary: one token per line, ordered by feature index
print(f"Generating {VOCAB_FILE}...")
# vocabulary_ maps token -> feature index; flip each pair so that sorting
# orders the tokens by index (indices are unique, so ties never occur).
vocab = sorted((idx, token) for token, idx in vectorizer.vocabulary_.items())
lines = [f'{token}\n' for _, token in vocab]

# Write as UTF-8 explicitly: with the locale default encoding, non-ASCII
# tokens can raise UnicodeEncodeError on some platforms.
with open(VOCAB_FILE, 'w', encoding='utf-8') as f:
    f.writelines(lines)
