package iitb.cfilt.cpost.crfpp;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.*;
import iitb.cfilt.cpost.lexicon.LexReader;
import iitb.cfilt.cpost.lexicon.LexRuleReader;
import iitb.cfilt.cpost.ConfigReader;
import iitb.cfilt.cpost.dmstemmer.MorphResult;
import iitb.cfilt.cpost.dmstemmer.NewStemmer;
import iitb.cfilt.cpost.dmstemmer.MAResult;
import iitb.cfilt.cpost.UTFWriter;
import iitb.cfilt.cpost.vgi.*;

public class DMFileFormer
{
	// Verb group identifier used by CreateFile; main() assigns it before calling CreateFile.
	private static VerbGroup9 vg;
	
	// Filled by setCategoriesAndTagmaps from the "HMM.tagCat" file: for every "left-right" line,
	// categories receives the right-hand part and tagcats the left-hand part (same index in both).
	public static Vector<String> categories;
	public static Vector<String> tagcats;
	// Set by CreateFile to the result of getDistinctTags().
	public static Vector<String> distinctTags;
	
	// The three flags below are overwritten by CreateFile from its string arguments on every call.
	private static boolean noNN_NNP_Distinction = true; // True when the NN-NNP distinction is not required: CreateFile then rewrites NNP tags to NN.
	private static boolean VGI = true; // True when VGI is to be applied: a positive verb group result then decides the category column. (Nikhilesh)
	private static boolean tags = true; // True when CreateFile should write one feature row per word.
	
	
	/**
	 * Reads the tag list from the file named by the "CRF.tagfile" configuration entry.
	 * <p>
	 * The file is read as UTF-8, one entry per line. Within each line, runs of
	 * spaces are collapsed to a single space and leading/trailing whitespace is
	 * removed. A blank line yields an empty entry.
	 *
	 * @return the normalised lines in file order; if reading fails the stack trace
	 *         is printed and whatever was read up to that point is returned
	 */
	public static Vector<String> getDistinctTags(){
		// Local result deliberately has a different name than the static field distinctTags.
		Vector<String> result = new Vector<String>();
		BufferedReader bf = null;

		try
		{
			String tagFile = ConfigReader.get("CRF.tagfile");
			bf = new BufferedReader(new InputStreamReader(new FileInputStream(tagFile), "UTF8"));
			String line;
			while((line = bf.readLine()) != null)
			{
				// Collapse multiple spaces: keep only the non-blank pieces and re-join them with one space.
				StringBuilder sb = new StringBuilder();
				for (String s : line.split(" ")) {
					if (s.trim().length() > 0) {
						sb.append(s).append(" ");
					}
				}
				result.add(sb.toString().trim());
			}
		}catch(Exception e){
			e.printStackTrace();
		}finally{
			// Close the reader even when reading failed half way through.
			if(bf != null)
			{
				try
				{
					bf.close();
				}catch(Exception e){
					e.printStackTrace();
				}
			}
		}

		return result;
	}

	/**
	 * Loads the tag/category map from the file named by the "HMM.tagCat"
	 * configuration entry and appends it to the two given vectors.
	 * <p>
	 * Every non-blank line must have the form <code>left-right</code>: the trimmed
	 * left part is appended to {@code tagcats} and the trimmed right part to
	 * {@code categories}, so both vectors stay index-aligned. Blank lines are skipped.
	 *
	 * @param categories receives the part after the first '-' of every line
	 * @param tagcats receives the part before the first '-' of every line
	 * @throws Exception if the file cannot be read, or if a non-blank line has no
	 *         second '-'-separated part
	 */
	public static void setCategoriesAndTagmaps(Vector<String> categories, Vector<String> tagcats) throws Exception
	{
		String tagCatMapFile = ConfigReader.get("HMM.tagCat");
		BufferedReader bf = new BufferedReader(new InputStreamReader(new FileInputStream(tagCatMapFile), "UTF8"));
		try
		{
			String line;
			while((line = bf.readLine()) != null)
			{
				// A stray empty line (e.g. at the end of the file) is not an error.
				if(line.trim().length() == 0)
				{
					continue;
				}
				String[] temp = line.split("-");
				if(temp.length < 2)
				{
					throw new IllegalArgumentException("Malformed line in " + tagCatMapFile + " (expected 'tag-category'): '" + line + "'");
				}
				categories.add(temp[1].trim());
				tagcats.add(temp[0].trim());
			}
		}
		finally
		{
			// Release the file handle on both the normal and the failure path.
			bf.close();
		}
	}
	
	/**
	 * Deletes the file with the given name, if it exists, and creates a new empty one.
	 *
	 * @param fileName path of the file to (re)create
	 * @return a File object for the path
	 * @throws Exception if an existing file (or directory) at that path cannot be
	 *         deleted, or if the new file cannot be created
	 */
	public static File forceCreateFile(String fileName) throws Exception
	{
		File file = new File(fileName);
		// Fail loudly instead of silently handing back the stale file.
		if(file.exists() && !file.delete())
		{
			throw new java.io.IOException("Could not delete existing file: " + file.getAbsolutePath());
		}
		file.createNewFile();
		
		return file;
	}
	
	/**
	 * Opens a UTFWriter on a freshly created file: the file is first (re)created
	 * through forceCreateFile and then wrapped in a new UTFWriter.
	 *
	 * @param fileName path of the output file
	 * @return a writer on the new file
	 * @throws Exception whatever forceCreateFile or the UTFWriter constructor throws
	 */
	public static UTFWriter createUTFWriter(String fileName) throws Exception
	{
		return new UTFWriter(forceCreateFile(fileName));
	}
	
	/**
	 * Maps a compound tag to its base tag: NNPC becomes NNP, NNC becomes NN and
	 * JJC becomes JJ. Any other tag is returned unchanged.
	 *
	 * @param intag the tag to convert
	 * @return the base tag for the three compound tags, otherwise {@code intag} itself
	 */
	public static String extractBaseTagFromCompundTag(String intag)
	{
		if(intag.equals("NNPC"))
		{
			return "NNP";
		}
		if(intag.equals("NNC"))
		{
			return "NN";
		}
		if(intag.equals("JJC"))
		{
			return "JJ";
		}
		return intag;
	}
	/**
	 * Creates the training file required by the CRF++ package.
	 * <p>
	 * Every non-empty line of {@code inFile} is split on <code>]</code> into
	 * <code>word_[TAG</code> pieces and each word is stemmed. When {@code flagtags}
	 * is true, one tab-separated row is written per word with these columns, in order:
	 * the word, the category column (category names joined by '*', or
	 * noun/verb/verb_aux from verb group identification, or SYM/unknown), one 0/1
	 * flag per configured category, the stemmer suffix, the last 1 to 4 characters,
	 * the word length, the first 1 to 4 characters, seven morphological features,
	 * the first detailed suffix and finally the tag. A newline is written after
	 * every input line, including empty ones.
	 * <p>
	 * Any exception is caught and its stack trace printed; the reader and the
	 * writer are closed in all cases.
	 *
	 * @param inFile This is the input file with tagged data
	 * @param outFile This file is created for using as a training file for CRF++. It contains multiple columns for creating features.
	 * @param flag "true" (any case) sets noNN_NNP_Distinction, i.e. NNP tags are written as NN // Nikhilesh //
	 * @param flagVGI "true" (any case) sets VGI, i.e. a positive verb group result decides the category column // Nikhilesh //
	 * @param flagtags "true" (any case) enables writing of the feature rows; otherwise only the newlines are written
	 */
	public static void CreateFile(String inFile, String outFile, String flag, String flagVGI, String flagtags)
	{
		LexReader lr = new LexReader();
		LexRuleReader lrr = new LexRuleReader();
		NewStemmer stemmerDM = new NewStemmer(lr, lrr);
		UTFWriter ob = null;
		BufferedReader bf = null;
		try
		{
			noNN_NNP_Distinction = flag.equalsIgnoreCase("true");
			VGI = flagVGI.equalsIgnoreCase("true");
			tags = flagtags.equalsIgnoreCase("true");

			distinctTags = getDistinctTags();

			categories = new Vector<String>();
			tagcats = new Vector<String>();
			setCategoriesAndTagmaps(categories, tagcats);

			// main() installs the verb group identifier; create it on demand so that
			// callers that invoke CreateFile directly do not fail on a null reference.
			if(tags && vg == null)
			{
				vg = new VerbGroup9();
			}

			ob = createUTFWriter(outFile);
			bf = new BufferedReader(new InputStreamReader(new FileInputStream(inFile), "UTF8"));

			// Contents of the symbols file; loaded at most once, and only if some word needs it.
			Set<String> symbols = null;

			String line;
			while((line = bf.readLine()) != null)
			{
				if(line.length() != 0)
				{
					Vector<String> words = new Vector<String>();
					Vector<String> labels = new Vector<String>();
					Vector<MAResult> mrv = new Vector<MAResult>();

					String[] taggedWords = line.split("\\]");
					for(int i=0;i<taggedWords.length;i++)
					{
						if(!taggedWords[i].contains("_"))
						{
							continue;
						}
						String[] taggedWordComponents = taggedWords[i].split("_\\[");
						String token = taggedWordComponents[0].trim();
						// An empty word is replaced by "." (compare by content, not by reference).
						if(token.length() == 0)
						{
							token = ".";
						}
						String tag = extractBaseTagFromCompundTag(taggedWordComponents[1].trim());

						// Note NNPC is already converted to NNP in extractBaseTagFromCompundTag
						if(noNN_NNP_Distinction && tag.equals("NNP"))
						{
							tag = "NN";
						}

						//Each word is stemmed first.
						MAResult mr = stemmerDM.stem(token);
						mr.printDetail();
						mrv.add(mr);
						words.add(token);
						labels.add(tag);
					}

					if(tags)
					{
						// Verb group identification is always run; its result is only used when VGI is set.
						int[] vga = vg.identifyVerbGroups1(mrv);

						for(int i=0; i<mrv.size(); i++)
						{
							MAResult current = mrv.get(i);

							// Distinct categories over all morph outputs of this word, sorted.
							Vector<String> tokenCategories = new Vector<String>();
							for(int j=0; j<current.getMorphOutputs().size(); j++)
							{
								String category = current.getMorphOutputs().get(j).getStemmerResult().getCategory();
								if(!tokenCategories.contains(category))
								{
									tokenCategories.add(category);
								}
							}
							Collections.sort(tokenCategories);

							// One 0/1 flag per configured category, all initially 0.
							int[] catFlags = new int[categories.size()];

							/* Start Extra Features : Nikhilesh */
							String suffix = "NoSuff";
							String morphFeatures;
							if(current.getMorphOutputs()!=null && !current.getMorphOutputs().isEmpty()
									&& current.getMorphOutputs().get(0).getMorphAnalysis() != null
									&& !current.getMorphOutputs().get(0).getMorphAnalysis().isEmpty())
							{
								// Only the first analysis of the first morph output is used.
								MorphResult mRes = current.getMorphOutputs().get(0).getMorphAnalysis().get(0);
								morphFeatures = normalizeMorphValue(mRes.getAspect())
										+ "\t" + normalizeMorphValue(mRes.getCAse())
										+ "\t" + normalizeMorphValue(mRes.getGender())
										+ "\t" + normalizeMorphValue(mRes.getMode())
										+ "\t" + normalizeMorphValue(mRes.getNumber())
										+ "\t" + normalizeMorphValue(mRes.getPerson())
										+ "\t" + normalizeMorphValue(mRes.getTense());
								if(current.getMorphOutputs().get(0).getStemmerResult()!=null)
								{
									suffix = current.getMorphOutputs().get(0).getStemmerResult().getSuffix();
									if(suffix == null || suffix.trim().equals(""))
									{
										suffix = "NoSuff";
									}
								}
							}
							else
							{
								morphFeatures = "null\tnull\tnull\tnull\tnull\tnull\tnull";
							}

							if(current.getMorphOutputs()!=null && !current.getMorphOutputs().isEmpty()
									&& current.getMorphOutputs().get(0).getDetailSuffixesAnalysis()!=null
									&& !current.getMorphOutputs().get(0).getDetailSuffixesAnalysis().isEmpty()
									&& current.getMorphOutputs().get(0).getDetailSuffixesAnalysis().get(0)!=null)
							{
								String suff = current.getMorphOutputs().get(0).getDetailSuffixesAnalysis().get(0).getSuffix();
								if(suff == null || suff.trim().equals(""))
								{
									suff = "Null";
								}
								morphFeatures = morphFeatures + "\t" + suff;
							}
							else
							{
								morphFeatures = morphFeatures + "\t" + "null";
							}
							/* End Extra Features : Nikhilesh */

							String ambiguityScheme;
							if(VGI && vga[i]>0)
							{
								// Verb group result decides the category: 100 -> noun,
								// any other value above 1 -> verb_aux, exactly 1 -> verb.
								if(vga[i]==100)
								{
									ambiguityScheme = "noun";
								}
								else if(vga[i]>1)
								{
									ambiguityScheme = "verb_aux";
								}
								else
								{
									ambiguityScheme = "verb";
								}
								catFlags[categories.indexOf(ambiguityScheme)] = 1;
							}
							else
							{
								// Join the sorted categories with '*' and raise the flag of each one.
								StringBuilder joined = new StringBuilder();
								for(int j=0;j<tokenCategories.size();j++)
								{
									String category = tokenCategories.get(j);
									if(j > 0)
									{
										joined.append("*");
									}
									joined.append(category);
									catFlags[categories.indexOf(category)] = 1;
								}
								ambiguityScheme = joined.toString();
							}

							// No category at all: the word is a symbol if the symbols file lists it, else unknown.
							if(ambiguityScheme.trim().equals(""))
							{
								if(symbols == null)
								{
									symbols = loadSymbols();
								}
								ambiguityScheme = symbols.contains(current.getToken()) ? "SYM" : "unknown";
							}

							StringBuilder cats = new StringBuilder();
							for(int j=0;j<catFlags.length;j++)
							{
								cats.append(catFlags[j]).append("\t");
							}

							String curr_word = words.get(i);
							int len = curr_word.length();
							String combined_suffixes = affixColumns(curr_word, true);
							String combined_prefixes = affixColumns(curr_word, false);

							//Writing this tab separated line as one training instance in the output file.
							ob.writeUTF(curr_word + "\t" + ambiguityScheme + "\t" + cats.toString() + suffix + "\t" + combined_suffixes + Integer.toString(len) + "\t" + combined_prefixes + morphFeatures + "\t" + labels.get(i) + "\n");
						}
					}
				}
				ob.writeUTF("\n");
			}
		}
		catch(Exception e)
		{
			e.printStackTrace();
		}
		finally
		{
			// Close both streams on every path so that already written rows are not lost.
			if(bf != null)
			{
				try
				{
					bf.close();
				}
				catch(Exception e)
				{
					e.printStackTrace();
				}
			}
			if(ob != null)
			{
				try
				{
					ob.close();
				}
				catch(Exception e)
				{
					e.printStackTrace();
				}
			}
		}
	}

	// Returns "Null" for a missing or blank morphological value, otherwise the value with all spaces removed.
	private static String normalizeMorphValue(String value)
	{
		if(value == null || value.trim().equals(""))
		{
			return "Null";
		}
		return value.replaceAll(" ", "");
	}

	// Builds four tab-terminated columns holding the last (fromEnd) or first 1..4 characters of word; "No" where the word is too short.
	private static String affixColumns(String word, boolean fromEnd)
	{
		StringBuilder columns = new StringBuilder();
		int len = word.length();
		for(int k=1; k<=4; k++)
		{
			if(len < k)
			{
				columns.append("No");
			}
			else if(fromEnd)
			{
				columns.append(word.substring(len-k));
			}
			else
			{
				columns.append(word.substring(0,k));
			}
			columns.append("\t");
		}
		return columns.toString();
	}

	// Reads every line of the file named by the "Lexicon.symbols" configuration entry into a set.
	private static Set<String> loadSymbols() throws Exception
	{
		Set<String> symbols = new HashSet<String>();
		String symbolsfile = ConfigReader.get("Lexicon.symbols");
		BufferedReader bfs = new BufferedReader(new InputStreamReader(new FileInputStream(symbolsfile), "UTF8"));
		try
		{
			String symbol;
			while((symbol = bfs.readLine()) != null)
			{
				symbols.add(symbol);
			}
		}
		finally
		{
			bfs.close();
		}
		return symbols;
	}
	/**
	 * Command line entry point: reads the configuration, creates the verb group
	 * identifier and runs CreateFile.
	 *
	 * @param args args[0] configuration file, args[1] input file, args[2] output file,
	 *        args[3] NN/NNP flag, args[4] VGI flag, args[5] tags flag
	 *        (arguments 1 to 5 are trimmed and passed to CreateFile in this order)
	 */
	public static void main(String args[])
	{
		// All six arguments are read below; report a short command line instead of failing on an array index.
		if(args == null || args.length < 6)
		{
			System.err.println("Usage: DMFileFormer <configFile> <inFile> <outFile> <noNN_NNP_Distinction> <VGI> <tags>");
			System.exit(1);
			return;
		}
		ConfigReader.read(args[0]);
		vg = new VerbGroup9();
		CreateFile(args[1].trim(),args[2].trim(),args[3].trim(),args[4].trim(),args[5].trim());
	}
}