import json,os,requests,gdown,zipfile,gzip
from bs4 import BeautifulSoup
from pathlib import Path

# Working directory (relative to the current directory) that receives the
# downloaded archive, its extracted contents and the generated text files.
DATA_PATH = Path('AmazonCat13K')
# Sub-directory of DATA_PATH from which the gzipped JSON splits are read.
RAW_PATH = DATA_PATH.joinpath('AmazonCat-13K.raw')
# File-name template for the generated text files; `{}` is filled with the
# partition name.
RAW_TEXT_FILE = 'AmazonCat-13K_raw_texts_{}.txt'


# get urls for raw text
# Fetch the repository page, find the right-aligned table cell whose text is
# exactly 'AmazonCat-13K', and take the href of the second link in the cell
# that follows it (presumably the raw-text download page -- TODO confirm the
# link order against the live page).
response = requests.get(
    'http://manikvarma.org/downloads/XC/XMLRepository.html#Julian13',
    timeout=60,  # do not hang forever on an unresponsive server
)
response.raise_for_status()  # an HTTP error page must not be parsed as data
soup = BeautifulSoup(response.text, 'html.parser')
item = soup.find_all('td', align='right')
web_text = ''
for i in item:
    if i.getText() == 'AmazonCat-13K':
        # `links` rather than `next`, so the builtin next() is not shadowed.
        links = i.find_next_siblings('td')[0].select('a')
        web_text = links[1].get('href')
if not web_text:
    # Fail here with a clear message instead of later with an obscure
    # "invalid URL" error when the empty string is requested.
    raise RuntimeError(
        "Could not locate the 'AmazonCat-13K' raw-text link on the repository page"
    )

# download file and extract from zip
# Create the working directory; no error if it already exists.
DATA_PATH.mkdir(parents=True, exist_ok=True)
re_text = requests.get(web_text, timeout=60)
re_text.raise_for_status()  # an HTTP error page must not be parsed as data
su_text = BeautifulSoup(re_text.text, 'html.parser')
url_text = ''
# The text of the page's second <script> element is split on commas; the
# first fragment containing 'download' is decoded as a JSON value and used as
# the download URL.
# NOTE(review): this depends on the exact layout of the hosting page.
for fragment in su_text.find_all('script')[1].getText().split(','):
    if 'download' in fragment:
        print(fragment)
        url_text = json.loads(fragment)
        break
out_file_text = str(DATA_PATH.resolve() / 'text.zip')
# Skip the download when the archive is already on disk.
if not os.path.isfile(out_file_text):
    if not url_text:
        raise RuntimeError(
            'Could not find a download URL in the page at {}'.format(web_text)
        )
    gdown.download(url=url_text, output=out_file_text)
# Pass the path rather than an open file object: ZipFile only closes files it
# opened itself, so the previous open(...) handle was never closed.
with zipfile.ZipFile(out_file_text) as f:
    f.extractall(path=str(DATA_PATH.resolve()))

# extract raw text from json.gz
# Not populated in this section; kept so the module-level name stays available.
raw_text = {'test': [], 'train': []}

# The record with this uid gets the hard-coded description below in place of
# its own 'content' field ("add missed raw text").
MISSING_CONTENT_UID = '0070134561'
# Whitespace is normalised once here, so the literal may be wrapped freely
# across source lines without changing the emitted text.
MISSING_CONTENT = ' '.join((
    'Small plane navigation made easy-Learn the basics, in plain english '
    'If you need a practical-and basic-introduction to navigation, Light Airplane Navigation Essentials is for you. '
    'This book is designed for new small aircraft pilots and students just entering flight training who want a guide that will clearly teach the fundamentals they need. '
    'Light Airplane Navigation Essentials takes the mysteries out of navigation and clarifies techniques and technologies, such as pilotage and dead reckoning... course plotting... radio navigation... VOR, Loran, and GPS... airspace rules... even the glass cockpit. '
    'Paul Craig has taught a very successful navigation course for several years, and knows exactly what the new pilot needs to learn. '
    'He explains the theory and practice of navigation in a straightforward conversational style, making generous use of illustrations and graphics. '
    'Each chapter builds directly on the one before, going from the most basic principles to the latest electronic devices. '
    'The book has several handy features, including: Questions and sample navigation problems to help you test your knowledge included at the end of each chapter; Large, clear illustrations of key concepts; Chapters on future types of navigation, including the "glass cockpit"; A section on the latest airspace designations. '
    'Read this book, work the problems, and take your new skills straight into the cockpit. '
    'Fly with the confidence that you thoroughly understand the principles of navigation. '
    'You won\'t find a better hands-on guide to the basics. '
    'ABOUT THE AUTHOR Paul A. Craig, an experienced pilot, is the Chief Flight Instructor at Middle Tennessee State University in Murfreesboro, Tennessee. '
    'He holds eleven FAA Flight Certificates, including Airline Transport Pilot, Gold Seal Multiengine, and Instrument Flight Instructor. '
    'He has a master\'s degree in Aerospace Education. '
    'Craig is also the author of Multiengine Flying, Second Edition, Be a Better Pilot, and Stalls and Spins (McGraw-Hill). '
    'Paul A. Craig, Ed.D., longtime pilot, FAA award-winning flight instructor, and aviation educator and author, designed and conducted the research described in this book based on his lifelong concern with the high accident rate among general aviation pilots, and in the process of earning his doctorate in education, with special empahsis on pilot decision-making and flight training. '
    'A Gold Seal Multiengine Flight Instructor and twice FAA District Flight Instructor of the Year, he has spoken widely to flight instructors and others on improving flight training and safety. '
    'He is the author of Be a Better Pilot; Stalls & Spins; Multiengine Flying, 2nd Edition; and Light Airplane Navigation Essentials, all from McGraw-Hill\'s renowned Practical Flying Series.'
).split())

# Each input line is one JSON object; each output line has the form
#   "<space-separated target_ind>\t<title> <content>\n"
# with every whitespace run inside title and content collapsed to one space.
for file, partition in zip(['tst.json.gz', 'trn.json.gz'], ['test', 'train']):
    RAW_FILE = RAW_PATH / file
    RAW_TEXT_PATH = DATA_PATH / RAW_TEXT_FILE.format(partition)
    # Context managers close both files even if a line fails to parse.
    with open(str(RAW_TEXT_PATH.resolve()), 'w', encoding='utf-8') as out_file:
        # Read explicitly as UTF-8 so decoding does not depend on the
        # platform's default locale encoding.
        with gzip.open(str(RAW_FILE.resolve()), 'rt', encoding='utf-8') as f:
            for line in f:
                instance = json.loads(line)
                labels = ' '.join(str(label) for label in instance['target_ind'])
                # The last character of the title is dropped before
                # normalising (presumably a trailing delimiter -- TODO confirm).
                title = ' '.join(instance['title'][:-1].split())
                if instance['uid'] == MISSING_CONTENT_UID:  # add missed raw text
                    content = MISSING_CONTENT
                else:
                    content = ' '.join(instance['content'].split())
                out_file.write(labels + '\t' + title + ' ' + content + '\n')
