
package iitb.cfilt.cpost.crfpp;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.*;

import iitb.cfilt.cpost.*;
import iitb.cfilt.cpost.newstemmer.*;
import iitb.cfilt.cpost.crf.DataSequenceImpl;
import iitb.cfilt.cpost.crf.MorphToken;
import iitb.cfilt.cpost.ma.*;
import iitb.cfilt.cpost.test.TestVGI;
import iitb.cfilt.cpost.vgi.*;

/**
 * Converts a POS-tagged corpus into the tab-separated, multi-column training
 * file consumed by the CRF++ package.
 *
 * <p>The collaborators held in the static fields below are created by
 * {@link #main(String[])}; {@link #CreateFile(String, String, String, String)}
 * relies on them being initialised and on the configuration having been read.
 */
public class FileFormer
{
	private static Stemmer stemmer;
	private static MorphologicalAnalyzer ma;
	// Constructed in main() but not otherwise used in this class; kept because the
	// constructor may perform initialisation that other components rely on (unverified).
	private static VerbGroupIdentifier vgi;
	private static MorphologicalAnalyzerRuleReader MAR;
	private static VerbGroup7 vg;

	/** Second field of every line of the "HMM.tagCat" file (fields are separated by '-'). */
	public static Vector<String> categories;
	/** First field of every line of the "HMM.tagCat" file, index-aligned with {@link #categories}. */
	public static Vector<String> tagcats;
	/** One entry per line of the "CRF.tagfile" file, with runs of spaces collapsed. */
	public static Vector<String> distinctTags;

	/** When true, NNC/NNP/NNPC/XC labels are all written as NN (no NN-NNP distinction). */
	private static boolean NNFlag = true;
	/** When true, tokens marked by the verb group identifier get "verb"/"verb_aux" as ambiguity scheme. */
	private static boolean VGI = true;

	/** Placeholder written for a prefix/suffix column when the word is too short. */
	private static final String MISSING_AFFIX = "No";

	/**
	 * Creates the training file required by the CRF++ package.
	 *
	 * <p>Each non-empty input line is treated as one sentence of tokens written
	 * as {@code word_[TAG]}. For every token one output line is written with the
	 * columns: word, ambiguity scheme, one 0/1 flag per entry of
	 * {@link #categories}, stemmer suffix (or "NoSuff"), suffixes of length 1-4,
	 * word length, prefixes of length 1-4, label. A blank line terminates every
	 * sentence. Any exception is printed and stops the conversion; the reader
	 * and writer are closed in every case.
	 *
	 * @param inFile input file with tagged data (read as UTF-8)
	 * @param outFile training file to create; an existing file is replaced
	 * @param flag "true"/"false" (case-insensitive) sets NNFlag; any other value leaves it unchanged
	 * @param flagVGI "true"/"false" (case-insensitive) sets the VGI flag; any other value leaves it unchanged
	 */
	public static void CreateFile(String inFile, String outFile, String flag,String flagVGI)
	{
		try
		{
			NNFlag = parseFlag(flag, NNFlag);
			VGI = parseFlag(flagVGI, VGI);

			loadDistinctTags(ConfigReader.get("CRF.tagfile"));
			loadTagCategoryMap(ConfigReader.get("HMM.tagCat"));

			File outfile = new File(outFile);
			if(outfile.exists())
			{
				outfile.delete();
			}
			UTFWriter ob = new UTFWriter(outfile);
			try
			{
				BufferedReader bf = openUtf8Reader(inFile);
				try
				{
					String line;
					while((line = bf.readLine()) != null)
					{
						if(line.length() != 0)
						{
							writeSentence(line, ob);
						}
					}
				}
				finally
				{
					bf.close();
				}
			}
			finally
			{
				// Closed even on failure so that the file handle is not leaked.
				ob.close();
			}

			System.out.println("Done!");
		}
		catch(Exception e)
		{
			System.out.println(e.toString());
			e.printStackTrace();
		}
	}

	/**
	 * Interprets a textual boolean flag.
	 *
	 * @return true for "true", false for "false" (both case-insensitive), otherwise {@code current}
	 */
	private static boolean parseFlag(String value, boolean current)
	{
		if("true".equalsIgnoreCase(value))
			return true;
		if("false".equalsIgnoreCase(value))
			return false;
		return current;
	}

	/** Opens the given file for line-wise reading as UTF-8. */
	private static BufferedReader openUtf8Reader(String path) throws Exception
	{
		return new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF8"));
	}

	/** Fills {@link #distinctTags} with the lines of the tag file, collapsing runs of spaces. */
	private static void loadDistinctTags(String tagFile) throws Exception
	{
		distinctTags = new Vector<String>();
		BufferedReader reader = openUtf8Reader(tagFile);
		try
		{
			String line;
			while((line = reader.readLine()) != null)
			{
				StringBuilder normalised = new StringBuilder();
				for(String piece : line.split(" "))
				{
					if(piece.trim().length() > 0)
					{
						normalised.append(piece).append(" ");
					}
				}
				distinctTags.add(normalised.toString().trim());
			}
		}
		finally
		{
			reader.close();
		}
	}

	/**
	 * Fills {@link #tagcats} and {@link #categories} from a file whose lines have
	 * the form {@code first-second}. Blank lines are ignored.
	 */
	private static void loadTagCategoryMap(String tagCatMapFile) throws Exception
	{
		categories = new Vector<String>();
		tagcats = new Vector<String>();
		BufferedReader reader = openUtf8Reader(tagCatMapFile);
		try
		{
			String line;
			while((line = reader.readLine()) != null)
			{
				if(line.trim().length() == 0)
					continue;
				String[] fields = line.split("-");
				if(fields.length < 2)
					throw new IllegalStateException("Malformed line in " + tagCatMapFile + ": '" + line + "'");
				categories.add(fields[1].trim());
				tagcats.add(fields[0].trim());
			}
		}
		finally
		{
			reader.close();
		}
	}

	/**
	 * Maps compound tags onto their base tag: JJC becomes JJ and NNC becomes NN;
	 * with NNFlag set NNP, NNPC and XC also become NN, otherwise NNPC becomes NNP.
	 */
	private static String normalizeTag(String tag)
	{
		if(tag.equals("JJC"))
			return "JJ";
		if(tag.equals("NNC"))
			return "NN";
		if(NNFlag)
		{
			if(tag.equals("NNP") || tag.equals("NNPC") || tag.equals("XC"))
				return "NN";
		}
		else if(tag.equals("NNPC"))
		{
			return "NNP";
		}
		return tag;
	}

	/** Stems and analyses every tagged token of one sentence and writes its feature lines. */
	private static void writeSentence(String line, UTFWriter ob) throws Exception
	{
		Vector<String> words = new Vector<String>();
		Vector<String> labels = new Vector<String>();
		Vector<MorphologicallyAnalyzedToken> matv = new Vector<MorphologicallyAnalyzedToken>();

		String[] taggedWords = line.split("\\]");
		for(int i=0;i<taggedWords.length;i++)
		{
			// Fragments without '_' (e.g. trailing whitespace after the last ']') carry no token.
			if(!taggedWords[i].contains("_"))
				continue;

			String[] taggedWordComponents = taggedWords[i].split("_\\[");
			String token = taggedWordComponents[0].trim();
			if(token.length() == 0)
				token = ".";
			String label = normalizeTag(taggedWordComponents[1].trim());

			// Each word is stemmed first.
			StemmedToken st = stemmer.stem(token);

			// Put the ultimate deletion obtained after stemming into the suffix list
			// before the token is handed to the morphological analyzer.
			Vector<StemmerRuleResult> stemmedOutputs = st.getStemmedOutputs();
			for(StemmerRuleResult result : stemmedOutputs)
			{
				if(result.getSuffixList().indexOf(result.getUltimateDeletion()) == -1)
					result.getSuffixList().add(result.getUltimateDeletion());
			}

			matv.add(ma.analyze(st,MAR));
			words.add(token);
			labels.add(label);
		}

		// One integer per token: values > 1 yield "verb_aux", 1 yields "verb",
		// anything else falls back to the category based ambiguity scheme.
		int[] vga = vg.identifyVerbGroups1(matv);

		for(int i=0;i<matv.size();i++)
		{
			ob.writeUTF(buildFeatureLine(words.get(i), labels.get(i), matv.get(i), vga[i]));
		}
		ob.writeUTF("\n");
	}

	/**
	 * Builds the tab-separated training line (terminated by '\n') for one token.
	 *
	 * @throws IllegalStateException if the category based ambiguity scheme is needed
	 *         but the token has no category, or a category missing from {@link #categories}
	 */
	private static String buildFeatureLine(String word, String label, MorphologicallyAnalyzedToken mat, int verbGroupCode)
	{
		// Distinct categories proposed by the morphological analyzer, in sorted order.
		TreeSet<String> tokenCategories = new TreeSet<String>();
		Vector<MorphologicalAnalyzerRuleResult> analyses = mat.getMorphologicalAnalyzerOutputs();
		for(int j=0;j<analyses.size();j++)
		{
			tokenCategories.add(analyses.get(j).getCategory().trim());
		}

		int[] catFlags = new int[categories.size()];
		String ambiguityScheme;
		if(VGI && verbGroupCode > 0)
		{
			ambiguityScheme = (verbGroupCode > 1) ? "verb_aux" : "verb";
		}
		else
		{
			if(tokenCategories.isEmpty())
				throw new IllegalStateException("No morphological category found for word '" + word + "'");
			StringBuilder scheme = new StringBuilder();
			for(String category : tokenCategories)
			{
				int position = categories.indexOf(category);
				if(position < 0)
					throw new IllegalStateException("Category '" + category + "' of word '" + word + "' is not listed in the HMM.tagCat file");
				catFlags[position] = 1;
				scheme.append(category).append('*');
			}
			scheme.setLength(scheme.length() - 1); // drop the trailing '*'
			ambiguityScheme = scheme.toString();
		}

		String suffix = StemmerRuleReader.getSuffix(word);
		if(suffix == null)
			suffix = "NoSuff";

		StringBuilder out = new StringBuilder();
		out.append(word).append('\t').append(ambiguityScheme).append('\t');
		for(int j=0;j<catFlags.length;j++)
			out.append(catFlags[j]).append('\t');
		out.append(suffix).append('\t');
		for(int n=1;n<=4;n++)
			out.append(suffixOfLength(word, n)).append('\t');
		out.append(word.length()).append('\t');
		for(int n=1;n<=4;n++)
			out.append(prefixOfLength(word, n)).append('\t');
		out.append(label).append('\n');
		return out.toString();
	}

	/** Returns the last {@code n} characters of {@code word}, or "No" if the word is shorter. */
	private static String suffixOfLength(String word, int n)
	{
		int len = word.length();
		return (len >= n) ? word.substring(len - n) : MISSING_AFFIX;
	}

	/** Returns the first {@code n} characters of {@code word}, or "No" if the word is shorter. */
	private static String prefixOfLength(String word, int n)
	{
		return (word.length() >= n) ? word.substring(0, n) : MISSING_AFFIX;
	}

	/**
	 * Command line entry point.
	 *
	 * @param args configuration file, tagged input file, output file, NNFlag
	 *        ("true"/"false"), VGI flag ("true"/"false"); further arguments are ignored
	 */
	public static void main(String args[])
	{
		if(args.length < 5)
		{
			System.err.println("Usage: FileFormer <configFile> <inFile> <outFile> <NNFlag:true|false> <VGIFlag:true|false>");
			return;
		}

		ConfigReader.read(args[0]);
		stemmer = new Stemmer();
		MAR = new MorphologicalAnalyzerRuleReader();
		ma = new MorphologicalAnalyzer();
		vgi = new VerbGroupIdentifier();
		vg = new VerbGroup7();
		CreateFile(args[1].trim(),args[2].trim(),args[3].trim(),args[4].trim());
	}
}