#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Size of every line/sequence buffer; also bounds the residues kept per DSSP file. */
#define MAX 10000

/* Close fp when it is open. Returns 0 on success (or when fp is NULL), EOF otherwise. */
static int close_if_open(FILE *fp)
{
	return fp != NULL ? fclose(fp) : 0;
}

/*
 * Read the residue records of an open DSSP file and build the loop-filtered
 * sequence.
 *
 * The first 25 lines are skipped as header. From every following line the
 * character at column 13 is stored in aa and the character at column 16 in
 * sse. Residues whose column-16 code is one of H, G, I, B, E are always copied
 * to kept; a run of any other residues (a "loop") is copied only when its
 * length is <= threshold.
 *
 * aa, sse and kept must each point to MAX bytes; they are zeroed first so the
 * results are NUL-terminated strings.
 *
 * Returns the index at which the last loop run began, or -MAX when the file
 * contained no loop residue at all.
 */
static int strip_long_loops(FILE *dssp_fp, int threshold,
			    char *aa, char *sse, char *kept)
{
	/* Not NUL-terminated: always size it with sizeof, never strlen. */
	static const char remain[] = {'H', 'G', 'I', 'B', 'E'};
	enum { HEADER_LINES = 25, AA_COL = 13, SSE_COL = 16 };
	char line[MAX];
	int skipped = 0;
	int p = 0;		/* number of residues read so far */
	int k = 0;		/* number of residues copied to kept */
	int start = -MAX;	/* index where the current/last loop began */
	int loophead = 0;	/* 1 while inside a loop run */

	memset(aa, 0, MAX);
	memset(sse, 0, MAX);
	memset(kept, 0, MAX);

	while (fgets(line, MAX, dssp_fp) != NULL) {
		if (skipped < HEADER_LINES) {
			skipped++;
			continue;
		}

		/* Lines too short to carry both columns would otherwise be
		 * read through stale bytes left by the previous line. */
		line[strcspn(line, "\r\n")] = '\0';
		if (strlen(line) <= SSE_COL)
			continue;

		/* Keep one byte for the terminating NUL of aa/sse/kept. */
		if (p >= MAX - 1) {
			fprintf(stderr, "DSSP sequence truncated to %d residues\n", MAX - 1);
			break;
		}

		aa[p] = line[AA_COL];
		sse[p] = line[SSE_COL];

		if (memchr(remain, sse[p], sizeof remain) != NULL) {
			/* helix or beta: flush the loop that just ended if it is short enough */
			if (loophead == 1 && p - start <= threshold) {
				memcpy(kept + k, aa + start, (size_t)(p - start));
				k += p - start;
			}
			loophead = 0;
			kept[k++] = aa[p];
		} else if (loophead == 0) {
			start = p;
			loophead = 1;
		}
		p++;
	}

	/* The sequence may end inside a loop. Only flush when a loop is really
	 * open: otherwise residues that were already copied would be appended a
	 * second time (and start may still be -MAX). */
	if (loophead == 1 && p - start <= threshold) {
		memcpy(kept + k, aa + start, (size_t)(p - start));
		k += p - start;
	}

	return start;
}

/*
 * Usage: ./split [source_file] [train_file] [Threshold]
 *
 * Reads source_file line by line. A line starting with '>' is a header whose
 * first token is the PID (7 characters after '>') and whose second token is a
 * dotted SCOP class such as a.1.1.1; the first two components form the fold
 * key. Any other line triggers processing of "<PID>.ent.dssp" (see
 * strip_long_loops).
 *
 * A record whose fold key equals that of the previously processed record, and
 * for which no test entry has been written since the fold last changed, becomes
 * a test entry: its PID goes to test.list, header and filtered sequence go to
 * "<PID>.test", and header, full AA string, SSE string and filtered sequence
 * go to test.out. Every other record is written to train_file (header +
 * filtered sequence) and train.out. Records whose PID contains '.' are
 * discarded, although their header is still echoed to train.out when they
 * fall on the training side.
 *
 * Returns EXIT_SUCCESS when the whole input was processed, EXIT_FAILURE on a
 * usage error or any I/O failure.
 */
int main(int argc, char *argv[])
{
	FILE *fp1 = NULL;		/* source sequences */
	FILE *fp2 = NULL;		/* training sequences */
	FILE *test_fp = NULL;		/* per-PID query file, open only between a test header and its sequence */
	FILE *testlist_fp = NULL;
	FILE *test_outfp = NULL;
	FILE *train_outfp = NULL;
	FILE *dssp_fp = NULL;
	char str[MAX] = {0};
	char str2[MAX] = {0};
	char tmp[MAX] = {0};
	char AA[MAX] = {0};
	char SSE[MAX] = {0};
	char test_name[100] = {0};
	char dssp_name[100] = {0};
	char PID[20] = {0};
	char scop_class[100] = {0};
	char pre_fold[50] = {0};
	char cur_fold[50] = {0};
	char *pch;
	char *endp;
	long parsed;
	int threshold;
	int hasprinted = 0;
	int invalid = 1;	/* lines before the first valid header are skipped */
	int start;
	int write_failed = 0;
	int rc = EXIT_FAILURE;

	if (argc < 4) {
		fprintf(stderr, "Usage ./split [source_file] [train_file] [Threshold]\n");
		fprintf(stderr, "Will create [test.list] [test.out] [train.out] files\n");
		return EXIT_FAILURE;
	}

	parsed = strtol(argv[3], &endp, 10);
	if (endp == argv[3] || *endp != '\0') {
		fprintf(stderr, "invalid threshold '%s'\n", argv[3]);
		return EXIT_FAILURE;
	}
	/* A loop is between 1 and MAX residues long, so clamping to [0, MAX]
	 * never changes a keep/drop decision and keeps the value inside int. */
	if (parsed < 0)
		parsed = 0;
	if (parsed > MAX)
		parsed = MAX;
	threshold = (int)parsed;

	if ((fp1 = fopen(argv[1], "r")) == NULL) {
		fprintf(stderr, "file %s open error\n", argv[1]);
		goto cleanup;
	}
	if ((fp2 = fopen(argv[2], "w")) == NULL) {
		fprintf(stderr, "file %s open error\n", argv[2]);
		goto cleanup;
	}
	if ((testlist_fp = fopen("test.list", "w")) == NULL) {
		fprintf(stderr, "file test.list open error\n");
		goto cleanup;
	}
	if ((test_outfp = fopen("test.out", "w")) == NULL) {
		fprintf(stderr, "file test.out open error\n");
		goto cleanup;
	}
	if ((train_outfp = fopen("train.out", "w")) == NULL) {
		fprintf(stderr, "file train.out open error\n");
		goto cleanup;
	}

	while (fgets(str, MAX, fp1) != NULL) {
		pch = strtok(str, "\n\r");
		if (pch == NULL)
			continue;	/* blank line */
		snprintf(str2, sizeof str2, "%s", pch);

		if (str[0] == '>') {
			char *fold_major;
			char *fold_minor;
			int same_fold;

			/* Stay invalid until the header has been parsed, so the
			 * sequence line of a malformed record is skipped. */
			invalid = 1;

			pch = strtok(str, " \t");	/* PID, ex: >ld1aej */
			if (pch == NULL)
				continue;
			snprintf(PID, sizeof PID, "%.7s", pch + 1);

			pch = strtok(NULL, " \t");	/* SCOP class ex: a.1.1.1 */
			if (pch == NULL) {
				fprintf(stderr, "malformed header: %s\n", str2);
				continue;
			}
			snprintf(scop_class, sizeof scop_class, "%s", pch);

			fold_major = strtok(scop_class, ".");
			fold_minor = strtok(NULL, ".");
			if (fold_major == NULL || fold_minor == NULL) {
				fprintf(stderr, "malformed header: %s\n", str2);
				continue;
			}
			snprintf(cur_fold, sizeof cur_fold, "%s%s", fold_major, fold_minor);

			invalid = 0;
			if (strchr(PID, '.') != NULL) {
				fprintf(stderr, "%s discarded\n", PID);
				invalid = 1;
			}

			same_fold = (strcmp(cur_fold, pre_fold) == 0);
			if (same_fold && hasprinted == 0) {
				/* create the query sequence file */
				snprintf(test_name, sizeof test_name, "%s.test", PID);
				if (invalid != 1) {
					/* A previous test header without a sequence
					 * line would otherwise leak its stream. */
					if (test_fp != NULL) {
						if (fclose(test_fp) != 0)
							write_failed = 1;
						test_fp = NULL;
					}
					if ((test_fp = fopen(test_name, "w")) == NULL) {
						fprintf(stderr, "file %s open error\n", test_name);
						goto cleanup;
					}
					fprintf(testlist_fp, "%s\n", PID);
					fprintf(test_outfp, "%s\n", str2);
					fprintf(test_fp, "%s\n", str2);
				}
			} else {
				/* A new fold re-arms the "one test entry per fold" latch. */
				if (!same_fold)
					hasprinted = 0;
				fprintf(train_outfp, "%s\n", str2);
				if (invalid != 1)
					fprintf(fp2, "%s\n", str2);
			}
		} else {
			if (invalid == 1)
				continue;

			/* loop-removal algorithm starts here */
			snprintf(dssp_name, sizeof dssp_name, "%s.ent.dssp", PID);
			if ((dssp_fp = fopen(dssp_name, "r")) == NULL) {
				fprintf(stderr, "File %s open error\n", dssp_name);
				goto cleanup;
			}
			start = strip_long_loops(dssp_fp, threshold, AA, SSE, tmp);
			fclose(dssp_fp);
			dssp_fp = NULL;
			/* end of loop-removal algorithm */

			/* test_fp is only open when the header of this record was
			 * routed to the test side; anything else is training data. */
			if (strcmp(cur_fold, pre_fold) == 0 && hasprinted == 0 && test_fp != NULL) {
				fprintf(test_fp, "%s\n", tmp);
				hasprinted = 1;
				fprintf(test_outfp, "%s\n", AA);
				fprintf(test_outfp, "%s\n", SSE);
				fprintf(test_outfp, "%s %d\n", tmp, start);
				if (fclose(test_fp) != 0)	/* close output stream */
					write_failed = 1;
				test_fp = NULL;
			} else {
				fprintf(fp2, "%s\n", tmp);
				fprintf(train_outfp, "%s\n", AA);
				fprintf(train_outfp, "%s\n", SSE);
				fprintf(train_outfp, "%s %d\n", tmp, start);
			}
			/* Same buffer size on both sides, so this cannot overflow. */
			strcpy(pre_fold, cur_fold);
		}
	}
	rc = EXIT_SUCCESS;

cleanup:
	close_if_open(dssp_fp);
	close_if_open(fp1);
	/* fclose flushes: a failure here means output data was lost. */
	if (close_if_open(test_fp) != 0)
		write_failed = 1;
	if (close_if_open(fp2) != 0)
		write_failed = 1;
	if (close_if_open(testlist_fp) != 0)
		write_failed = 1;
	if (close_if_open(test_outfp) != 0)
		write_failed = 1;
	if (close_if_open(train_outfp) != 0)
		write_failed = 1;
	if (write_failed) {
		fprintf(stderr, "error while writing output files\n");
		rc = EXIT_FAILURE;
	}
	return rc;
}

