package iitb.cfilt.cpost.newstemmer;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectInputStream;
import java.io.ObjectOutput;
import java.io.ObjectOutputStream;
import java.util.*;
import java.io.*;
import java.sql.Timestamp;

import iitb.cfilt.cpost.UTFWriter;
import iitb.cfilt.cpost.ConfigReader;
import iitb.cfilt.cpost.lexicon.WordProperties;
import iitb.cfilt.cpost.lexicon.Wordlist;
//import iitb.cfilt.cpost.logger.Logger;
import iitb.cfilt.cpost.tokenizer.TaggedTokenizer;
import iitb.cfilt.cpost.utils.AccuracyReportWriter;
import iitb.cfilt.cpost.utils.UTFConsole;
import iitb.cfilt.cpost.ma.*;


public class Stemmer {
	//private Wordlist wordlist;
	//private StemmerRuleReader stemmerRuleReader;

	//Flags
	// Not read or written by any method of this class.
	static boolean mmAtStemmer = true;
	// When false, the no-argument constructor discards any dictionary it deserialized
	// and starts with an empty stemDictionary.
	static boolean useStemDictionary = false;
	// Set to true at the end of the no-argument constructor; while true, further
	// no-argument constructions return immediately without reloading resources.
	static boolean populated;
	// Overwritten by the three-argument constructor from the config key
	// "NewStemmer.useNounList"; read by stem1 when a token produced no analysis.
	static boolean useNounList = false;

	// Token -> stemmed result map; assigned by the no-argument constructor and
	// serialized by writeStemDictionary().
	public static HashMap<String, StemmedToken> stemDictionary;
	// Rule reader shared by all instances; (re)created by both constructors.
	public static StemmerRuleReader stemmerRuleReader;

	
	// Instance created lazily and returned by getInstance(...).
	public static Stemmer stemObj;

	/**
	 * Returns the shared <code>Stemmer</code>, creating it on the first call.
	 * The arguments are only used when the instance does not exist yet; on later
	 * calls they are ignored and the previously created instance is returned.
	 *
	 * @param lang    passed to the three-argument constructor
	 * @param config  passed to the three-argument constructor
	 * @param baseDir passed to the three-argument constructor
	 * @return the shared instance held in <code>stemObj</code>
	 */
	public static Stemmer getInstance(String lang, String config, String baseDir) {
		if (stemObj == null) {
			stemObj = new Stemmer(lang, config, baseDir);
		}
		return stemObj;
	}
	
	/**
	 * Loads the stemmer resources once per JVM: the optional serialized stem
	 * dictionary (path taken from the config key "Stemmer.stemDictionary"), the
	 * word list and the stemmer rules. If the resources were already loaded
	 * (<code>populated</code> is true) the constructor returns immediately.
	 *
	 * The dictionary read from disk is only kept when <code>useStemDictionary</code>
	 * is set and the read succeeded; in every other case an empty dictionary is
	 * installed, so <code>stemDictionary</code> is never left unassigned.
	 */
	@SuppressWarnings("unchecked")
	public Stemmer(){
		if(populated)
			return;
		System.out.println("************Resource Reading********");
		HashMap<String, StemmedToken> loaded = null;
		FileInputStream fin = null;
		try {
			fin = new FileInputStream(ConfigReader.get("Stemmer.stemDictionary"));
			ObjectInput oin = new ObjectInputStream(fin);
			loaded = (HashMap<String, StemmedToken>) oin.readObject();
		} catch (FileNotFoundException e) {
			// A missing dictionary file is not an error: an empty dictionary is used below.
		} catch (IOException e) {
			e.printStackTrace();
		} catch (ClassNotFoundException e) {
			e.printStackTrace();
		} finally {
			// Closing the file stream releases the handle even when the object
			// stream could not be created or the read failed half-way.
			if(fin != null){
				try {
					fin.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		if(loaded != null && useStemDictionary){
			stemDictionary = loaded;
		} else {
			stemDictionary = new HashMap<String, StemmedToken>();
		}
		Wordlist.populate();
		stemmerRuleReader = new StemmerRuleReader();
		stemmerRuleReader.populate();
		populated = true;
	}

	public Stemmer(String lang, String config, String baseDir) // This version of constructor is used in the CLIA language analyzer
	{
		//if(populated == true)
		//	return;
		//System.out.println("************Resource Reading********");
		ConfigReader.read(config);
		Wordlist.populate(baseDir);
		stemmerRuleReader = new StemmerRuleReader();
		stemmerRuleReader.populate(baseDir);
		useNounList = Boolean.parseBoolean(ConfigReader.get("NewStemmer.useNounList"));
		//populated = true;
	}
	
	/**
	 * Serializes <code>stemDictionary</code> to the file named by the config key
	 * "Stemmer.stemDictionary". I/O failures are reported on standard error and
	 * otherwise ignored. The streams are always closed, which also flushes the
	 * data buffered by the object stream to disk.
	 */
	static void writeStemDictionary(){
		FileOutputStream fout = null;
		ObjectOutput oout = null;
		try {
			fout = new FileOutputStream(ConfigReader.get("Stemmer.stemDictionary"));
			oout = new ObjectOutputStream(fout);
			oout.writeObject(stemDictionary);
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Close the object stream first so its buffer is flushed, then make
			// sure the file handle is released even if that close failed or the
			// object stream was never created.
			try {
				if(oout != null)
					oout.close();
			} catch (IOException e) {
				e.printStackTrace();
			} finally {
				if(fout != null){
					try {
						fout.close();
					} catch (IOException e) {
						e.printStackTrace();
					}
				}
			}
		}
	}

	
	
	/**
	 * Stems every token of the input in order.
	 *
	 * @param tokens the tokens to stem; must not be <code>null</code>
	 * @return the non-null results of {@link #stem(String)} in input order, or
	 *         <code>null</code> when there are no results at all
	 */
	public Vector<StemmedToken> stem (Vector<String> tokens){
		Vector<StemmedToken> stemmed = new Vector<StemmedToken>(tokens.size());
		for(String token : tokens){
			StemmedToken result = stem(token);
			if(result == null)
				continue;
			stemmed.add(result);
		}
		// Callers receive null, not an empty vector, when nothing was produced.
		return stemmed.isEmpty() ? null : stemmed;
	}
	
	/**
	 * This function is used to stem the input word. Uses stem1 function internally for all specified spelling variations.
	 * <p>
	 * The token itself is stemmed first. Only when that analysis is unknown are the
	 * spelling variations supplied by the rule reader tried; every variation with a
	 * known analysis replaces the outputs collected so far, so the last such
	 * variation determines the result. The returned object keeps the original token.
	 * 
	 * @param token The token which is to be stemmed
	 * @return A <code>StemmedToken</code> with multiple stemmer results
	 */
	
	public StemmedToken stem(String token)
	{
		StemmedToken retVal = stem1(token);
		if(!isUnknownAnalysis(retVal))
			return retVal;

		Vector<String> spellingVariationsVector = stemmerRuleReader.getSpellingVariations(token);
		if(spellingVariationsVector == null)
			return retVal;

		for(String currentSpellingVariation : spellingVariationsVector)
		{
			StemmedToken st = stem1(currentSpellingVariation);
			if(!isUnknownAnalysis(st))
			{
				retVal.resetStemmedOutputs();
				retVal.addAllStemmerOutputs(st.getStemmedOutputs());
			}
		}
		return retVal;
	}

	/**
	 * Tells whether a stemming result carries no usable analysis: either it has no
	 * outputs at all, or its first output has the category "unknown". The emptiness
	 * check prevents an index error when stem1 returns a token without outputs.
	 */
	private static boolean isUnknownAnalysis(StemmedToken stemmed){
		return stemmed.getStemmedOutputs().size() == 0
			|| stemmed.getStemmedOutputs().get(0).getCategory().trim().equals("unknown");
	}
	
	/**
	 * This function is used to stem the input word
	 * <p>
	 * Steps, in order:
	 * <ol>
	 * <li>If the word contains a hyphen that is not its first character, only the
	 *     part before the first hyphen is analysed; the full word is put back as
	 *     the token of the result at the end.</li>
	 * <li>Special-character rules: a match is returned immediately as the only output.</li>
	 * <li>Lexicon lookup of the word as it is.</li>
	 * <li>For every category: charam-suffix removal, or, if that yields nothing,
	 *     the suffix replacement rules.</li>
	 * <li>If a particle can be removed, the lexicon lookup and the category rules
	 *     (except category "particle") are repeated on the shortened word.</li>
	 * <li>If nothing was found: an "unknown" result is added, or, when
	 *     <code>useNounList</code> is set, noun suffixes are guessed instead.</li>
	 * </ol>
	 * 
	 * @param token The token which is to be stemmed
	 * @return A <code>StemmedToken</code> with multiple stemmer results
	 */

	public StemmedToken stem1(String token){
		final String originalToken = token;

		// Checking whether the word has hyphen (a leading hyphen is left alone)
		boolean hyphenStripped = false;
		int hyphenPos = token.indexOf('-');
		if(hyphenPos > 0)
		{
			token = token.substring(0, hyphenPos);
			hyphenStripped = true;
		}
		StemmedToken retVal = new StemmedToken(token);

		//Checking whether the word is a special charater or number
		StemmerRuleResult specialCharacterResult = stemmerRuleReader.applySpecialCharacterRulesOn(token);
		if(specialCharacterResult != null){
			retVal.addStemmerOutput(specialCharacterResult);
			// NOTE(review): this early return keeps the hyphen-stripped token and
			// skips sortResults(), unlike the normal exit below - confirm intended.
			return retVal;
		}

		//Checking whether the word is in the lexicon as it is.
		Vector<StemmerRuleResult> lexItems = checkInLexicon(token);
		if(lexItems != null){
			retVal.addAllStemmerOutputs(lexItems);
		}

		//Following call checks for each category in turn.
		Set<String> categories = stemmerRuleReader.getCategories();
		applyCategoryRules(retVal, token, categories, false);

		//Checking whether the word has some particle attached. If it is so, then repeat the above process by removing the suffix.
		String particleRemovedToken = stemmerRuleReader.removeParticle(token);
		if(!particleRemovedToken.equals(token))
		{
			Vector<StemmerRuleResult> lexItemsnew = checkInLexicon(particleRemovedToken);
			if(lexItemsnew != null){
				for(StemmerRuleResult lexItem : lexItemsnew)
				{
					retVal.addStemmerOutput(lexItem);
				}
			}
			applyCategoryRules(retVal, particleRemovedToken, categories, true);
		}

		if(retVal.getStemmedOutputs().size() == 0)
		{
			if(!useNounList)
			{
				StemmerRuleResult srr = new StemmerRuleResult(token,"unknown","unknown","","");
				retVal.addStemmerOutput(srr);
			}
			else
			{
				//Trying to remove noun suffixes by guessing the most probable paradigm for it.
				//It is used in the CLIA to reduce the number of unknown words.
				//Currently for POS tagging, this is not used.
				Vector<StemmerRuleResult> srrv = stemmerRuleReader.checkForNounSuffixes(token);
				if(srrv != null)
					retVal.addAllStemmerOutputs(srrv);
			}
		}

		retVal.sortResults();
		if(hyphenStripped)
		{
			// Report the analysis under the full hyphenated word that was passed in.
			retVal.setToken(originalToken);
		}

		return retVal;
	}

	/**
	 * Applies the per-category stemming rules to <code>word</code> and appends
	 * whatever they produce to <code>result</code>. For each category the charam
	 * (type 3) suffix removal is tried first; the suffix replacement rules are
	 * applied only when it returns nothing. Null rule results are skipped.
	 *
	 * @param result               collector for the rule outputs
	 * @param word                 the word form the rules are applied to
	 * @param categories           the categories to iterate over
	 * @param skipParticleCategory when true the category "particle" is not tried
	 */
	private void applyCategoryRules(StemmedToken result, String word, Set<String> categories, boolean skipParticleCategory){
		for(String cat : categories)
		{
			if(skipParticleCategory && cat.equals("particle"))
				continue;
			//Checking if type 3 (charam) suffix can be removed.
			Vector<StemmerRuleResult> srrv = stemmerRuleReader.removeCharamSuffix(word, cat);
			if(srrv == null)
			{
				//Apply all possible Suffix Replacement Rules.
				srrv = stemmerRuleReader.applySRRs(word, cat);
			}
			if(srrv != null)
				result.addAllStemmerOutputs(srrv);
		}
	}
	
	/**
	 * This function is used to check whether input word is in the lexicon
	 * <p>
	 * Every lexicon entry of the word becomes one result (duplicates are dropped).
	 * Only when the word itself has no entry is derivational morphology consulted:
	 * each proposed derivation is accepted if its root is in the lexicon with the
	 * expected root category, and is reported with an empty paradigm and the
	 * derived category.
	 * 
	 * @param token The token which is to be checked
	 * @return A Vector of <code>StemmerRuleResult</code> is returned depending upon the matches found in the lexicon,
	 *         or <code>null</code> when there is no match
	 */
	private Vector<StemmerRuleResult> checkInLexicon(String token) {
		Vector<StemmerRuleResult> matches = new Vector<StemmerRuleResult>();

		Vector<WordProperties> entries = Wordlist.searchWordlistFor(token);
		if(entries != null){
			for(WordProperties entry : entries){
				addUnique(matches, new StemmerRuleResult(token, entry.getParadigm(), entry.getCategory(), "", ""));
			}
		}

		//Checking for derivational morphology.
		if(matches.isEmpty()){
			Vector<String[]> derivations = stemmerRuleReader.checkDerivationalMorphology(token);
			if(derivations != null){
				for(String[] derivation : derivations){
					// Layout used here: [0] root, [1] category required of the root, [2] resulting category.
					String root = derivation[0];
					String requiredRootCategory = derivation[1];
					String resultingCategory = derivation[2];

					Vector<WordProperties> rootEntries = Wordlist.searchWordlistFor(root);
					if(rootEntries == null)
						continue;
					for(WordProperties rootEntry : rootEntries){
						if(rootEntry.getCategory().equals(requiredRootCategory)){
							addUnique(matches, new StemmerRuleResult(root, "", resultingCategory, "", ""));
						}
					}
				}
			}
		}

		return matches.isEmpty() ? null : matches;
	}

	/** Appends <code>candidate</code> unless the vector already contains an equal element. */
	private static void addUnique(Vector<StemmerRuleResult> results, StemmerRuleResult candidate){
		if(!results.contains(candidate))
			results.add(candidate);
	}

	/**
	 * Manual smoke test for the stemmer and the morphological analyzer.
	 * Reads the configuration named by the first argument, stems two sample
	 * Hindi words, runs the first through the morphological analyzer and prints
	 * the results together with a few timestamps.
	 *
	 * @param args <code>args[0]</code> is the configuration passed to <code>ConfigReader.read</code>
	 */
	public static void main(String args[]){
		if(args == null || args.length < 1)
		{
			// Without this check a missing argument surfaced as an array index error.
			System.err.println("Usage: Stemmer <config-file>");
			return;
		}

		ConfigReader.read(args[0]);
		Stemmer stemmer = new Stemmer();

		MorphologicalAnalyzerRuleReader maRuleReader = new MorphologicalAnalyzerRuleReader();
		MorphologicalAnalyzer ma = new MorphologicalAnalyzer();

		System.out.println(new Timestamp(System.currentTimeMillis()).toString());
		System.out.println(new Timestamp(System.currentTimeMillis()).toString());
		StemmedToken stemmedToken = stemmer.stem("ही");

		// Make sure each result's suffix list contains its ultimate deletion
		// before the token is handed to the morphological analyzer.
		Iterator<StemmerRuleResult> iterResult = stemmedToken.getStemmedOutputs().iterator();
		while(iterResult.hasNext())
		{
			StemmerRuleResult current = iterResult.next();
			if(current.getSuffixList().indexOf(current.getUltimateDeletion()) == -1)
				current.getSuffixList().add(current.getUltimateDeletion());
		}

		MorphologicallyAnalyzedToken mat = ma.analyze(stemmedToken, maRuleReader);
		if(mat.getAmbiguityScheme().contains("verb")){
			System.out.println("Contains Verb");
		}

		stemmedToken = stemmer.stem("अधिकतर");
		System.out.println(stemmedToken.getStemmedOutputs().toString());

		System.out.println(new Timestamp(System.currentTimeMillis()).toString());
		System.out.println("Done!");

		System.out.println(populated);
	}
}
