"""Process the MIMIC-50 data from CAML and put them to the data directory.
(1) Convert processed MIMIC3 data (e.g., train_50.csv, test_50.csv, and dev_50.csv) to
the format we use in our codebase.
(2) Copy processed_full.embed and vocab.csv to the data directory.
"""

import os
import pandas as pd
from pathlib import Path
import shutil


# Input: the mimic3 directory of a caml-mimic checkout under the user's home.
# Output: the MIMIC-50 dataset folder, relative to the current working directory.
HOME_DIR = str(Path.home())
CAML_MIMIC3_DIR = HOME_DIR + '/caml-mimic/mimicdata/mimic3'
DATA_DIR = './LibMultiLabel/data/MIMIC-50'


# Create the destination directory up front; an existing one is left as is.
os.makedirs(DATA_DIR, exist_ok=True)

# Each input split is read from '<split>_50.csv'; the 'dev' split is written
# out as 'valid.txt', the others keep their own name.
for split, out_name in (('train', 'train.txt'), ('test', 'test.txt'), ('dev', 'valid.txt')):
    source_csv = os.path.join(CAML_MIMIC3_DIR, split + '_50.csv')
    target_txt = os.path.join(DATA_DIR, out_name)

    frame = pd.read_csv(source_csv)
    # Turn every ';' in the LABELS column into a space.
    frame['LABELS'] = frame['LABELS'].str.replace(';', ' ')

    # Tab-separated output without a header row. The DataFrame index is kept
    # (pandas default), so each line is: index, labels, text.
    selected = frame[['LABELS', 'TEXT']]
    selected.to_csv(target_txt, sep='\t', header=False)
    print(f'Output {len(frame)} {split} samples to {target_txt}.')

# Copy the embedding file and the vocabulary next to the converted splits.
# This step is best-effort: a failed copy is reported but does not abort the
# script, since the converted splits have already been written.
print(f'Copying {CAML_MIMIC3_DIR}/processed_full.embed and {CAML_MIMIC3_DIR}/vocab.csv to {DATA_DIR}.')
try:
    for file_name in ('processed_full.embed', 'vocab.csv'):
        shutil.copy(os.path.join(CAML_MIMIC3_DIR, file_name), os.path.join(DATA_DIR, file_name))
except OSError as err:
    # Only filesystem errors (missing file, permissions, same-file copy) are
    # handled here; KeyboardInterrupt/SystemExit must not be swallowed.
    print(f'Failed to copy the embed file and the vocab file. Make sure you have processed_full.embed and vocab.csv in {CAML_MIMIC3_DIR}.')
    print(f'Reason: {err}')
