package iitb.cfilt.cpost.crfpp;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.*;
import iitb.cfilt.cpost.lexicon.LexReader;
import iitb.cfilt.cpost.lexicon.LexRuleReader;
import iitb.cfilt.cpost.ConfigReader;
import iitb.cfilt.cpost.dmstemmer.MorphOutput2;
import iitb.cfilt.cpost.dmstemmer.MorphResult;
import iitb.cfilt.cpost.dmstemmer.NewStemmer;
import iitb.cfilt.cpost.dmstemmer.MAResult;
import iitb.cfilt.cpost.UTFWriter;
import iitb.cfilt.cpost.vgi.*;

/**
 * Builds the tab-separated feature file consumed by CRF++ from a file of
 * tagged text.
 *
 * Each input line is expected to hold tokens of the form
 * <code>word_[TAG]</code>. Every token is stemmed, and one output row is
 * written per token containing the word, its ambiguity scheme, one 0/1 flag
 * per configured category, suffix/prefix features, morphological features and
 * the (possibly normalised) tag. A blank row separates input lines.
 *
 * All state is static; the flags set by {@link #CreateFile} persist between
 * calls.
 */
public class DMFileFormer
{
	/** Verb group identifier; set by main() or created lazily in CreateFile when VGI is enabled. */
	private static VerbGroup9 vg;
	
	/** Category names read from the "HMM.tagCat" file (right-hand side of each line). */
	public static Vector<String> categories;
	/** Tag names read from the "HMM.tagCat" file (left-hand side of each line). */
	public static Vector<String> tagcats;
	/** Lines of the "CRF.tagfile" file with runs of spaces collapsed. */
	public static Vector<String> distinctTags;
	
	private static boolean noNN_NNP_Distinction = true; // It is true when NN-NNP distinction is not required
	private static boolean VGI = true; // It is true when VGI is to be applied. By Nikhilesh 
	private static boolean doPoSTagging = true;
	private static boolean useMorphFeatures = true;
	
	
	/**
	 * Reads the file named by the "CRF.tagfile" configuration entry and returns
	 * one entry per line, with runs of spaces collapsed to a single space and
	 * leading/trailing whitespace removed.
	 *
	 * Errors are printed and whatever was read so far is returned.
	 *
	 * @return the tag lines read (never null)
	 */
	public static Vector<String> getDistinctTags(){
		Vector<String> tags = new Vector<String>();
		BufferedReader bf = null;

		try
		{
			String tagFile = ConfigReader.get("CRF.tagfile");
			bf = new BufferedReader(new InputStreamReader(new FileInputStream(tagFile), "UTF8"));
			String line;
			while((line = bf.readLine()) != null)
			{
				// Collapse runs of spaces: drop the blank pieces produced by the split.
				StringBuilder sb = new StringBuilder();
				for (String piece : line.split(" ")) {
					if (piece.trim().length() > 0) {
						sb.append(piece).append(" ");
					}
				}
				tags.add(sb.toString().trim());
			}
		}catch(Exception e){
			e.printStackTrace();
		}finally{
			// Close the reader even when reading failed part-way through.
			closeQuietly(bf);
		}

		return tags;
	}

	/**
	 * Reads the file named by the "HMM.tagCat" configuration entry, whose lines
	 * have the form <code>tag-category</code>, and appends the parsed values to
	 * the supplied vectors. Blank lines are ignored.
	 *
	 * @param categories receives the category (second field) of every line
	 * @param tagcats receives the tag (first field) of every line
	 * @throws IllegalArgumentException if a non-blank line has no second field
	 * @throws Exception if the file cannot be opened or read
	 */
	public static void setCategoriesAndTagmaps(Vector<String> categories, Vector<String> tagcats) throws Exception
	{
		String tagCatMapFile = ConfigReader.get("HMM.tagCat");
		BufferedReader bf = new BufferedReader(new InputStreamReader(new FileInputStream(tagCatMapFile), "UTF8"));
		try
		{
			String line;
			while((line = bf.readLine()) != null)
			{
				if(line.trim().length() == 0)
				{
					// e.g. a trailing empty line; nothing to map
					continue;
				}
				String[] temp = line.split("-");
				if(temp.length < 2)
				{
					throw new IllegalArgumentException("Malformed line in " + tagCatMapFile + " (expected tag-category): " + line);
				}
				categories.add(temp[1].trim());
				tagcats.add(temp[0].trim());
			}
		}
		finally
		{
			bf.close();
		}
	}
	
	/**
	 * Deletes the named file if it exists and creates a new, empty one.
	 *
	 * @param fileName path of the file to (re)create
	 * @return a File object for the path
	 * @throws Exception if the file cannot be created
	 */
	public static File forceCreateFile(String fileName) throws Exception
	{
		File file = new File(fileName);
		if(file.exists())
		{
			file.delete();
		}
		file.createNewFile();
		
		return file;
	}
	
	/**
	 * Creates a fresh file (replacing any existing one) and opens a UTFWriter on it.
	 *
	 * @param fileName path of the output file
	 * @return a writer for the newly created file
	 * @throws Exception if the file or the writer cannot be created
	 */
	public static UTFWriter createUTFWriter(String fileName) throws Exception
	{
		return new UTFWriter(forceCreateFile(fileName));
	}
	
	/**
	 * Maps the compound tags NNPC, NNC and JJC to NNP, NN and JJ respectively.
	 *
	 * @param intag tag to normalise
	 * @return the base tag, or intag itself when it is not one of the compound tags
	 */
	public static String extractBaseTagFromCompundTag(String intag)
	{
		if(intag.equals("NNPC"))
			return "NNP";
		if(intag.equals("NNC"))
			return "NN";
		if(intag.equals("JJC"))
			return "JJ";
		
		return intag;
	}
	
	/**
	 * Normalises a tag: compound tags are reduced to their base tag and, when
	 * the NN/NNP distinction is switched off, NNP (and therefore NNPC) becomes NN.
	 *
	 * @param intag tag to normalise
	 * @return the normalised tag
	 */
	public static String editTagIfNeeded(String intag)
	{
		String tag = extractBaseTagFromCompundTag(intag);
		
		// NNPC has already been reduced to NNP above.
		if(noNN_NNP_Distinction && tag.equals("NNP")){
			tag = "NN";
		}
	
		return tag;
	}
	
	/**
	 * Collects the distinct stemmer categories of all morph outputs of a token.
	 *
	 * Outputs without a stemmer result or without a category are ignored.
	 *
	 * @param mar analysis result of one token
	 * @return the distinct categories in ascending natural order (possibly empty)
	 */
	public static Vector<String> getCategoriesForToken(MAResult mar)
	{
		// A TreeSet gives both de-duplication and the sorted order in one pass.
		TreeSet<String> distinct = new TreeSet<String>();
		
		Vector<MorphOutput2> outputs = mar.getMorphOutputs();
		if(outputs != null)
		{
			for(int j=0; j<outputs.size(); j++)
			{
				MorphOutput2 output = outputs.get(j);
				if(output == null || output.getStemmerResult() == null)
				{
					continue;
				}
				String category = output.getStemmerResult().getCategory();
				if(category != null)
				{
					distinct.add(category);
				}
			}
		}

		return new Vector<String>(distinct);
	}
	
	/**
	 * @param vec vector to test (may be null)
	 * @return true when vec is neither null nor empty
	 */
	public static boolean nonEmpty(Vector<?> vec){
		return vec != null && !vec.isEmpty();
	}
	
	/**
	 * @param mroutputs morph outputs of a token (may be null)
	 * @return true when there is a first output and it carries at least one morph analysis
	 */
	public static boolean nonEmptyMorphOutput(Vector<MorphOutput2> mroutputs){
		if(mroutputs == null || mroutputs.isEmpty()){
			return false;
		}
		return mroutputs.get(0).getMorphAnalysis() != null && !mroutputs.get(0).getMorphAnalysis().isEmpty();
	}
	
	/**
	 * Turns a raw feature value into a single output column value.
	 *
	 * @param inFeature raw value (may be null)
	 * @return "Null" for a null or blank value, otherwise the value with all spaces removed
	 */
	public static String processFeature(String inFeature){
		if(inFeature == null || inFeature.trim().equals(""))
		{
			return "Null";
		}
		return inFeature.replace(" ", "");
	}
	
	/**
	 * Builds the seven tab-separated morphological feature columns
	 * (aspect, case, gender, mood, number, person, tense) from the first morph
	 * analysis of the first output.
	 *
	 * @param mroutputs morph outputs of a token (may be null)
	 * @return the feature columns; seven "null" columns when morph features are
	 *         disabled or no analysis is available
	 */
	public static String getMorphFeatureString(Vector<MorphOutput2> mroutputs){
	
		if(!useMorphFeatures || !nonEmptyMorphOutput(mroutputs)){
			return "null\tnull\tnull\tnull\tnull\tnull\tnull";
		}
		
		MorphResult mRes = mroutputs.get(0).getMorphAnalysis().get(0);
		
		StringBuilder sb = new StringBuilder();
		sb.append(processFeature(mRes.getAspect())).append("\t");
		sb.append(processFeature(mRes.getCAse())).append("\t");
		sb.append(processFeature(mRes.getGender())).append("\t");
		sb.append(processFeature(mRes.getMode())).append("\t");
		sb.append(processFeature(mRes.getNumber())).append("\t");
		sb.append(processFeature(mRes.getPerson())).append("\t");
		sb.append(processFeature(mRes.getTense()));
		return sb.toString();
	}

	/**
	 * Returns the stemmer suffix of the first morph output.
	 *
	 * @param mroutputs morph outputs of a token (may be null)
	 * @return the suffix, or "NoSuff" when there is no usable output or the suffix is blank
	 */
	public static String getSuffix(Vector<MorphOutput2> mroutputs){
		if(nonEmptyMorphOutput(mroutputs) && mroutputs.get(0).getStemmerResult()!=null){
			String suffix = mroutputs.get(0).getStemmerResult().getSuffix();
			if(suffix != null && !suffix.trim().equals("")){
				return suffix;
			}
		}
		return "NoSuff";
	}

	/**
	 * Builds the character-level columns for a word: the last 1..4 characters,
	 * the word length, then the first 1..4 characters. Each column is followed
	 * by a tab; a column whose length exceeds the word length holds "No".
	 *
	 * @param curr_word the word (must not be null)
	 * @return nine tab-terminated columns: s1 s2 s3 s4 len p1 p2 p3 p4
	 */
	public static String getSuffixLenPrefixString(String curr_word){
		int len = curr_word.length();
		StringBuilder suffixes = new StringBuilder();
		StringBuilder prefixes = new StringBuilder();
		for(int k=1; k<=4; k++)
		{
			if(k <= len)
			{
				suffixes.append(curr_word.substring(len-k));
				prefixes.append(curr_word.substring(0,k));
			}
			else
			{
				suffixes.append("No");
				prefixes.append("No");
			}
			suffixes.append("\t");
			prefixes.append("\t");
		}

		return suffixes.toString() + Integer.toString(len) + "\t" + prefixes.toString();
	}

	/**
	 * Same as the six-argument CreateFile with morph features enabled.
	 *
	 * @param inFile input file with tagged data
	 * @param outFile output file to create
	 * @param flag "true" to drop the NN/NNP distinction
	 * @param flagVGI "true" to apply verb group identification
	 * @param flagtags "true" to write feature rows
	 */
	public static void CreateFile(String inFile, String outFile, String flag, String flagVGI, String flagtags)
	{
		CreateFile(inFile, outFile, flag, flagVGI, flagtags, "true");
	}
	
	/**
	 * This function is used to create the training file required for the CRF++ package.
	 *
	 * Any exception is printed and ends processing; the reader and writer are
	 * closed in every case.
	 *
	 * @param inFile This is the input file with tagged data (tokens of the form word_[TAG])
	 * @param outFile This file is created for using as a training file for CRF++. It contains multiple columns for creating features.
	 * @param flag "true" (case-insensitive) when the NN-NNP distinction is not required
	 * @param flagVGI "true" (case-insensitive) when verb group identification is to be applied
	 * @param flagtags "true" (case-insensitive) to write feature rows; otherwise tokens are only stemmed
	 * @param flagUseMorphFeatures "true" (case-insensitive) to fill the morphological feature columns
	 */
	public static void CreateFile(String inFile, String outFile, String flag, String flagVGI, String flagtags, String flagUseMorphFeatures)
	{
		
		LexReader lr = new LexReader();
		LexRuleReader lrr = new LexRuleReader();		
		NewStemmer stemmerDM = new NewStemmer(lr, lrr);
		BufferedReader bf = null;
		UTFWriter ob = null;
		try
		{
			noNN_NNP_Distinction = flag.equalsIgnoreCase("true");			
			VGI = flagVGI.equalsIgnoreCase("true");
			doPoSTagging = flagtags.equalsIgnoreCase("true");
			useMorphFeatures = flagUseMorphFeatures.equalsIgnoreCase("true");
			
			// main() normally creates the verb grouper; create it here for
			// callers that use CreateFile directly.
			if(VGI && vg == null)
			{
				vg = new VerbGroup9();
			}
			
			distinctTags = getDistinctTags();
			
			categories = new Vector<String>();
			tagcats = new Vector<String>();
			setCategoriesAndTagmaps(categories, tagcats);			
			
			ob = createUTFWriter(outFile);
			bf = new BufferedReader(new InputStreamReader(new FileInputStream(inFile), "UTF8"));
			
			String line;
			while((line = bf.readLine()) != null)
			{
				if(line.length() != 0)
				{
					Vector<String> words = new Vector<String>();
					Vector<String> labels = new Vector<String>();
					Vector<MAResult> mrv = new Vector<MAResult>();
					
					String[] taggedWords = line.split("\\]");
					for(int i=0;i<taggedWords.length;i++)
					{
						String chunk = taggedWords[i];
						if(!chunk.contains("_"))
						{
							System.out.println("XXError: no _ found in word " + chunk);
							continue;
						}
						
						String[] taggedWordComponents = chunk.split("_\\[");
						if(taggedWordComponents.length < 2)
						{
							// "_" present but no "_[TAG" part: skip instead of aborting the whole file
							System.out.println("XXError: no _[ found in word " + chunk);
							continue;
						}
						
						// Extract Token; an empty token is replaced by "."
						String token = taggedWordComponents[0].trim();
						if(token.length() == 0)
						{
							token = ".";
						}
						words.add(token);

						// Extract Tag
						labels.add(editTagIfNeeded(taggedWordComponents[1].trim()));

						//Stem Token
						MAResult mr = stemmerDM.stem(token);
						mr.printDetail();
						mrv.add(mr);
					}
					
					if(!doPoSTagging){
						// Nothing is written for this line, not even the separator row.
						continue;
					}
				
					//VGI being applied if the flag is set
					int[] vga = VGI ? vg.identifyVerbGroups1(mrv) : null;
					
					for(int i=0; i<mrv.size(); i++)
					{
						int verbGroupCode = VGI ? vga[i] : 0;
						//Writing this tab separated line as one training instance in the output file.
						ob.writeUTF(buildFeatureLine(words.get(i), labels.get(i), mrv.get(i), verbGroupCode, lr));
					}
				}
				// Blank row between input lines.
				ob.writeUTF("\n");
			}
		}
		catch(Exception e)
		{
			e.printStackTrace();
		}
		finally
		{
			closeQuietly(bf);
			if(ob != null)
			{
				try
				{
					ob.close();
				}
				catch(Exception e)
				{
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Builds one newline-terminated output row for a token:
	 * word, ambiguity scheme, category flags, stemmer suffix, character
	 * suffix/length/prefix columns, morph features, detail suffix, label.
	 *
	 * @param word the token text
	 * @param label the normalised tag of the token
	 * @param mar the analysis result of the token
	 * @param verbGroupCode value reported by the verb grouper for this token;
	 *        values above 1 yield "verb_aux", 1 yields "verb", 0 or less means
	 *        the stemmer categories are used
	 * @param lr lexicon reader whose symbol list decides between "SYM" and "unknown"
	 */
	private static String buildFeatureLine(String word, String label, MAResult mar, int verbGroupCode, LexReader lr)
	{
		Vector<Integer> catFlags = new Vector<Integer>();
		for(int j=0;j<categories.size();j++)
			catFlags.add(0);
		
		/* Extra Features : Nikhilesh */
		Vector<MorphOutput2> mroutputs = mar.getMorphOutputs();
		String suffix = getSuffix(mroutputs);
		String morphFeatures = getMorphFeatureString(mroutputs) + "\t" + getDetailSuffix(mroutputs);
		
		String AS;
		if(verbGroupCode > 1)
		{
			AS = "verb_aux";
			markCategory(catFlags, AS);
		}
		else if(verbGroupCode > 0)
		{
			AS = "verb";
			markCategory(catFlags, AS);
		}
		else
		{
			// Ambiguity scheme: all stemmer categories joined by "*".
			Vector<String> tokenCategories = getCategoriesForToken(mar);
			StringBuilder joined = new StringBuilder();
			for(int j=0;j<tokenCategories.size();j++)
			{
				if(j > 0)
				{
					joined.append("*");
				}
				joined.append(tokenCategories.get(j));
				markCategory(catFlags, tokenCategories.get(j));
			}
			AS = joined.toString();
		}
		
		if(AS.trim().equals("")){
			AS = lr.symbols.contains(mar.getToken()) ? "SYM" : "unknown";
		}
		
		StringBuilder cats = new StringBuilder();
		for(int j=0;j<categories.size();j++)
			cats.append(Integer.toString(catFlags.get(j))).append("\t");

		// cats and the suffix/length/prefix string already end with a tab.
		return word + "\t" + AS + "\t" + cats.toString() + suffix + "\t" + getSuffixLenPrefixString(word) + morphFeatures + "\t" + label + "\n";
	}

	/**
	 * Sets the flag of the given category to 1. A category that is not listed
	 * in the configured category map is reported and otherwise ignored.
	 */
	private static void markCategory(Vector<Integer> catFlags, String category)
	{
		int index = categories.indexOf(category);
		if(index >= 0)
		{
			catFlags.set(index, 1);
		}
		else
		{
			System.out.println("XXError: category " + category + " not found in category map");
		}
	}

	/**
	 * Returns the suffix of the first detailed suffix analysis of the first
	 * morph output: "null" when no such analysis exists, "Null" when its
	 * suffix is null or blank.
	 */
	private static String getDetailSuffix(Vector<MorphOutput2> mroutputs)
	{
		if(mroutputs == null || mroutputs.isEmpty())
		{
			return "null";
		}
		MorphOutput2 first = mroutputs.get(0);
		if(first.getDetailSuffixesAnalysis() == null || first.getDetailSuffixesAnalysis().isEmpty() || first.getDetailSuffixesAnalysis().get(0) == null)
		{
			return "null";
		}
		String suff = first.getDetailSuffixesAnalysis().get(0).getSuffix();
		if(suff == null || suff.trim().equals(""))
		{
			return "Null";
		}
		return suff;
	}

	/** Closes the reader if it was opened; a failure to close is printed, not propagated. */
	private static void closeQuietly(BufferedReader reader)
	{
		if(reader != null)
		{
			try
			{
				reader.close();
			}
			catch(Exception e)
			{
				e.printStackTrace();
			}
		}
	}

	/**
	 * Command line entry point.
	 *
	 * Arguments: configFile inFile outFile noNN_NNP_Distinction VGI doPoSTagging [useMorphFeatures].
	 * The last argument is optional and defaults to "true".
	 */
	public static void main(String args[])
	{
		if(args.length < 6)
		{
			System.err.println("Usage: DMFileFormer <configFile> <inFile> <outFile> <noNN_NNP_Distinction> <VGI> <doPoSTagging> [useMorphFeatures]");
			System.exit(1);
		}
		ConfigReader.read(args[0]);
		vg = new VerbGroup9();
		String flagUseMorphFeatures = args.length > 6 ? args[6].trim() : "true";
		CreateFile(args[1].trim(),args[2].trim(),args[3].trim(),args[4].trim(),args[5].trim(),flagUseMorphFeatures);
	}
}