package iitb.cfilt.cpost.newstemmer;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectInputStream;
import java.io.ObjectOutput;
import java.io.ObjectOutputStream;
import java.util.*;
import java.io.*;
import java.sql.Timestamp;

import iitb.cfilt.cpost.UTFWriter;
import iitb.cfilt.cpost.ConfigReader;
import iitb.cfilt.cpost.lexicon.WordProperties;
import iitb.cfilt.cpost.lexicon.Wordlist;
//import iitb.cfilt.cpost.logger.Logger;
import iitb.cfilt.cpost.tokenizer.TaggedTokenizer;
import iitb.cfilt.cpost.utils.AccuracyReportWriter;
import iitb.cfilt.cpost.utils.UTFConsole;
import iitb.cfilt.cpost.ma.*;

public class Stemmer {
	
	// Not read anywhere in this class; purpose is not evident from this file.
	static boolean mmAtStemmer = true;
	// When false, the no-argument constructor always starts with an empty stemDictionary.
	static boolean useStemDictionary = false;
	// Set to true at the end of the no-argument constructor so later calls skip resource loading.
	static boolean populated;
	// Selects the fallback used by stem1/stem2 when no analysis is found:
	// false -> emit a single "unknown" result, true -> ask the rule reader to guess noun suffixes.
	// The three-argument constructor overwrites it from the "NewStemmer.useNounList" config key.
	static boolean useNounList = false;

	// Stem cache, keyed by String; initialised by the no-argument constructor and
	// serialised by writeStemDictionary().
	public static HashMap<String, StemmedToken> stemDictionary;
	// Shared rule reader; (re)created and populated by both constructors.
	public static StemmerRuleReader stemmerRuleReader;

	
	// Instance handed out by getInstance(); created lazily on its first call.
	public static Stemmer stemObj;

	/**
	 * Returns the shared <code>Stemmer</code>, building it on the first call.
	 * The arguments are only used when the instance does not exist yet;
	 * once <code>stemObj</code> is set, later calls return it unchanged and
	 * ignore whatever is passed in.
	 *
	 * @param lang    forwarded to the three-argument constructor
	 * @param config  forwarded to the three-argument constructor
	 * @param baseDir forwarded to the three-argument constructor
	 * @return the shared stemmer instance
	 */
	public static Stemmer getInstance(String lang, String config, String baseDir) {
		if (stemObj != null) {
			return stemObj;
		}
		stemObj = new Stemmer(lang, config, baseDir);
		return stemObj;
	}
	
	/**
	 * Loads the stemmer resources using the paths already held by
	 * <code>ConfigReader</code>. Does nothing if a previous call has already
	 * populated the shared state.
	 *
	 * The serialised stem dictionary is only read when
	 * <code>useStemDictionary</code> is enabled. If it is disabled, missing,
	 * or unreadable, an empty dictionary is used instead, so
	 * <code>stemDictionary</code> is never left null.
	 */
	@SuppressWarnings("unchecked")
	public Stemmer(){
	
		if(populated){
			return;
		}
		System.out.println("************Resource Reading********");
		HashMap<String, StemmedToken> loaded = null;
		if(useStemDictionary){
			FileInputStream fin = null;
			try {
				fin = new FileInputStream(ConfigReader.get("Stemmer.stemDictionary"));
				ObjectInput oin = new ObjectInputStream(fin);
				loaded = (HashMap<String, StemmedToken>) oin.readObject();
			} catch (FileNotFoundException e) {
				// No cached dictionary yet: fall through to an empty one.
			} catch (IOException e) {
				e.printStackTrace();
			} catch (ClassNotFoundException e) {
				e.printStackTrace();
			} finally {
				// Closing the file stream releases the handle even if the
				// ObjectInputStream could not be constructed.
				if(fin != null){
					try {
						fin.close();
					} catch (IOException e) {
						e.printStackTrace();
					}
				}
			}
		}
		stemDictionary = (loaded != null) ? loaded : new HashMap<String, StemmedToken>();
		Wordlist.populate();
		stemmerRuleReader = new StemmerRuleReader();
		stemmerRuleReader.populate();
		populated = true;
	}

	public Stemmer(String lang, String config, String baseDir) // This version of constructor is used in the CLIA language analyzer
	{
		//if(populated == true)
		//	return;
		//System.out.println("************Resource Reading********");
		ConfigReader.read(config);
		Wordlist.populate(baseDir);
		stemmerRuleReader = new StemmerRuleReader();
		stemmerRuleReader.populate(baseDir);
		useNounList = Boolean.parseBoolean(ConfigReader.get("NewStemmer.useNounList"));
		//populated = true;
	}
	
	/**
	 * Serialises <code>stemDictionary</code> to the file named by the
	 * "Stemmer.stemDictionary" configuration key. Failures are reported on
	 * the standard error stream and otherwise ignored. The stream is always
	 * flushed and closed so the file is complete and the handle is released.
	 */
	static void writeStemDictionary(){
		FileOutputStream fout = null;
		ObjectOutput oout = null;
		try {
			fout = new FileOutputStream(ConfigReader.get("Stemmer.stemDictionary"));
			oout = new ObjectOutputStream(fout);
			oout.writeObject(stemDictionary);
			oout.flush();
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			try {
				if(oout != null){
					// Closing the object stream also closes the file stream.
					oout.close();
				} else if(fout != null){
					fout.close();
				}
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}

	
	
	/**
	 * Stems every token of the input in order.
	 *
	 * @param tokens the tokens to stem
	 * @return the stemmed tokens in input order, skipping any token whose
	 *         result is <code>null</code>; <code>null</code> (not an empty
	 *         vector) when nothing was produced
	 */
	public Vector<StemmedToken> stem (Vector<String> tokens){
		Vector<StemmedToken> stemmed = new Vector<StemmedToken>(tokens.size());
		for(String token : tokens){
			StemmedToken result = stem(token);
			if(result == null){
				continue;
			}
			stemmed.add(result);
		}
		// Writing the stem dictionary back to disk is currently disabled here.
		return stemmed.isEmpty() ? null : stemmed;
	}
	
	/**
	 * Stems the input word. The word is first analysed with
	 * <code>stem2</code>. Only if its first result has category "unknown"
	 * are the spelling variations supplied by the rule reader tried, each
	 * through <code>stem1</code>. Every variation that yields a known
	 * category replaces the current results, so the last such variation wins.
	 * 
	 * @param token The token which is to be stemmed
	 * @return A <code>StemmedToken</code> with multiple stemmer results
	 */
	
	public StemmedToken stem(String token)
	{
		StemmedToken retVal = stem2(token);
		boolean unknown = retVal.getStemmedOutputs().size() > 0
				&& retVal.getStemmedOutputs().get(0).getCategory().trim().equals("unknown");
		if(!unknown)
		{
			return retVal;
		}
		Vector<String> spellingVariationsVector = stemmerRuleReader.getSpellingVariations(token);
		if(spellingVariationsVector == null)
		{
			return retVal;
		}
		for(int i = 0; i < spellingVariationsVector.size(); i++)
		{
			StemmedToken st = stem1(spellingVariationsVector.get(i));
			// stem1 can legitimately return a token without any output
			// (e.g. its early-return paths), so guard before reading get(0).
			if(st.getStemmedOutputs().size() == 0)
			{
				continue;
			}
			if(!st.getStemmedOutputs().get(0).getCategory().trim().equals("unknown"))
			{
				retVal.resetStemmedOutputs();
				retVal.addAllStemmerOutputs(st.getStemmedOutputs());
			}
		}
		return retVal;
	}
	
	/**
	 * Stems a single word by combining a lexicon lookup with the suffix
	 * rules, suffix replacement rules and irregular verb mappings of every
	 * category known to the rule reader.
	 *
	 * If the word contains a hyphen that is not its first character, only
	 * the part before the first hyphen is analysed; the returned token still
	 * carries the full original word.
	 * 
	 * @param token The token which is to be stemmed
	 * @return A <code>StemmedToken</code> with multiple stemmer results
	 */
	public StemmedToken stem2(String token){
		StemmedToken result = new StemmedToken(token);

		// A hyphen at index > 0 means "analyse the head only".
		int hyphenAt = token.indexOf('-');
		boolean hyphenStripped = hyphenAt > 0;
		String head = hyphenStripped ? token.substring(0, hyphenAt) : token;

		/* check in lexicon */
		Vector<StemmerRuleResult> fromLexicon = checkInLexicon(head);
		if(fromLexicon != null){
			result.addAllStemmerOutputs(fromLexicon);
		}

		for(String category : stemmerRuleReader.getCategories()){
			/* regular suffixes */
			addMissingOutputs(result, stemmerRuleReader.applySuffixRules(head, category));
			/* suffix replacement rules */
			addMissingOutputs(result, stemmerRuleReader.applySRR(head, category));
			/* irregular verb mapping */
			addMissingOutputs(result, stemmerRuleReader.applyIrregularVerbMapping(head, category));
		}

		if(result.getStemmedOutputs().size() == 0){
			if(useNounList){
				// Guess the most probable noun paradigm and strip its suffix.
				// Used in CLIA to reduce the number of unknown words; not used for POS tagging.
				Vector<StemmerRuleResult> guessed = stemmerRuleReader.checkForNounSuffixes(head);
				result.addAllStemmerOutputs(guessed);
			} else {
				StemmerRuleResult unknown = new StemmerRuleResult(head,"unknown","unknown","","");
				result.addStemmerOutput(unknown);
			}
		}

		result.sortResults();

		if(hyphenStripped){
			result.setToken(token);
		}
		return result;
	}

	/** Appends each candidate to the target unless the target already contains it. */
	private static void addMissingOutputs(StemmedToken target, Vector<StemmerRuleResult> candidates){
		for(StemmerRuleResult candidate : candidates){
			if(!target.getStemmedOutputs().contains(candidate)){
				target.addStemmerOutput(candidate);
			}
		}
	}
	
	
	/**
	 * Stems a single word using the lexicon, the "charam" suffix removal and
	 * the suffix replacement rules of every category, then repeats the
	 * process on the word with an attached particle removed.
	 *
	 * If the word contains a hyphen that is not its first character, only
	 * the part before the first hyphen is analysed. The full word is
	 * restored on the returned token, except on the two early-return paths
	 * (special-character match, and the words "वाला" / "ही").
	 *
	 * @param token The token which is to be stemmed
	 * @return A <code>StemmedToken</code> with the stemmer results; on the
	 *         early-return paths the result list may be empty
	 */
	public StemmedToken stem1(String token){		
		
		String temp_token = token;
		
		// Checking whether the word has a hyphen that is not its first character
		boolean flag = false;
		int p = token.indexOf('-');
		if(p > 0)
		{
			token = token.substring(0,p);
			flag = true;
		}
		StemmedToken retVal = new StemmedToken(token);
		
		//Checking whether the word is a special character or number
		StemmerRuleResult specialCharacterResult = stemmerRuleReader.applySpecialCharacterRulesOn(token);
		if(specialCharacterResult != null){
			retVal.addStemmerOutput(specialCharacterResult);
			return retVal;
		}
		
		//Checking whether the word is in the lexicon as it is.
		Vector<StemmerRuleResult> lexItems = checkInLexicon(token);
		if(lexItems != null){
			retVal.addAllStemmerOutputs(lexItems);
		}
		// These two words are answered from the lexicon only.
		if(token.equals("वाला") || token.equals("ही")){
			return retVal;
		}
		
		Set<String> categories = stemmerRuleReader.getCategories();		
		
		//Following loop checks for each category in turn.		
		for(String cat : categories)
		{		
			//Checking if type 3 (charam) suffix can be removed.
			Vector<StemmerRuleResult> srrv = stemmerRuleReader.removeCharamSuffix(token,cat);
			if(srrv != null)
			{
				retVal.addAllStemmerOutputs(srrv);
				continue;
			}
			//Apply all possible Suffix Replacement Rules.
			srrv = stemmerRuleReader.applySRRs(token,cat);			
			for(int it=0; it<srrv.size(); it++){
				// NOTE(review): the validity verdict is ignored because the
				// filtering that used it was disabled. The call is kept in
				// case valid() has side effects -- confirm and remove if not.
				stemmerRuleReader.valid(srrv.get(it), cat);
			}
			if(srrv.size()>0){
				retVal.addAllStemmerOutputs(srrv);
			}
		}
		
		//Checking whether the word has some particle attached. If it is so, then repeat the above process by removing the suffix.
		String particleRemovedToken = stemmerRuleReader.removeParticle(token);
		if(!particleRemovedToken.equals(token))
		{
			Vector<StemmerRuleResult> lexItemsnew = checkInLexicon(particleRemovedToken);
			if(lexItemsnew != null){
				for(int z=0;z<lexItemsnew.size();z++)
				{
					retVal.addStemmerOutput(lexItemsnew.get(z));
				}
			}
			
			for(String cat : categories)
			{
				// The particle has just been stripped, so its own category is skipped.
				if(cat.equals("particle"))
					continue;
				Vector<StemmerRuleResult> srrv = stemmerRuleReader.removeCharamSuffix(particleRemovedToken,cat);
				if(srrv != null)
				{
					retVal.addAllStemmerOutputs(srrv);
					continue;
				}
				srrv = stemmerRuleReader.applySRRs(particleRemovedToken,cat);
				if(srrv != null)
					retVal.addAllStemmerOutputs(srrv);
			}
		}
		
		if(retVal.getStemmedOutputs().size() == 0)
		{
			if(!useNounList)
			{
				StemmerRuleResult srr = new StemmerRuleResult(token,"unknown","unknown","","");
				retVal.addStemmerOutput(srr);
			}
			else
			{
				//Trying to remove noun suffixes by guessing the most probable paradigm for it.
				//It is used in the CLIA to reduce the number of unknown words.
				//Currently for POS tagging, this is not used.
				Vector<StemmerRuleResult> srrv = stemmerRuleReader.checkForNounSuffixes(token);
				retVal.addAllStemmerOutputs(srrv);
			}
		}		
		retVal.sortResults();
		if(flag)
		{
			retVal.setToken(temp_token);
		}
		
		return retVal;
	}
	
	/**
	 * Looks the word up in the lexicon. When the word itself has no lexicon
	 * entry, the derivational morphology rules are consulted and each
	 * proposed root is accepted only if the lexicon lists that root under
	 * the category the rule expects.
	 * 
	 * @param token The token which is to be checked
	 * @return A Vector of <code>StemmerRuleResult</code> without duplicates,
	 *         or <code>null</code> when nothing matched
	 */
	private Vector<StemmerRuleResult> checkInLexicon(String token) {
		Vector<StemmerRuleResult> matches = new Vector<StemmerRuleResult>();

		// Direct lookup: one result per distinct (paradigm, category) entry.
		Vector<WordProperties> entries = Wordlist.searchWordlistFor(token);
		if(entries != null){
			for(WordProperties entry : entries){
				StemmerRuleResult candidate = new StemmerRuleResult(token, entry.getParadigm(), entry.getCategory(),"", "");
				if(!matches.contains(candidate)){
					matches.add(candidate);
				}
			}
		}

		// Derivational morphology is only tried when the direct lookup failed.
		if(matches.isEmpty()){
			Vector<String[]> derivations = stemmerRuleReader.checkDerivationalMorphology(token);
			if(derivations != null){
				for(String[] derivation : derivations){
					String root = derivation[0];
					String expectedRootCategory = derivation[1];
					String resultingCategory = derivation[2];

					Vector<WordProperties> rootEntries = Wordlist.searchWordlistFor(root);
					if(rootEntries == null){
						continue;
					}
					for(WordProperties rootEntry : rootEntries){
						if(!rootEntry.getCategory().equals(expectedRootCategory)){
							continue;
						}
						StemmerRuleResult candidate = new StemmerRuleResult(root, "", resultingCategory,"","");
						if(!matches.contains(candidate)){
							matches.add(candidate);
						}
					}
				}
			}
		}

		return matches.isEmpty() ? null : matches;
	}

	/**
	 * Manual smoke test: stems each word of a hard-coded sample sentence,
	 * prints the stemmer outputs, then runs the morphological analyzer on
	 * each result and prints its ambiguity scheme.
	 *
	 * @param args <code>args[0]</code> is the configuration source passed to
	 *             <code>ConfigReader.read</code>
	 */
	public static void main(String args[]){
		
		if(args == null || args.length == 0){
			// Without a configuration nothing below can work; say so
			// instead of failing with an index error.
			System.err.println("Usage: Stemmer <config-file>");
			return;
		}
		ConfigReader.read(args[0]);
		Stemmer stemmer = new Stemmer();
		
		MorphologicalAnalyzerRuleReader MAR = new MorphologicalAnalyzerRuleReader();
		MorphologicalAnalyzer ma = new MorphologicalAnalyzer();
		
		// Sample input; replace with any space-separated words to try others.
		String sentence = "चाहिए";
		if(sentence.contains("िया")){
			System.out.println("working..");
		}
		String[] words = sentence.split(" ");
		for(int i=0; i<words.length; i++){
			StemmedToken stemmedToken = stemmer.stem(words[i]);
			System.out.println(stemmedToken.getStemmedOutputs().toString());
			MorphologicallyAnalyzedToken mat = ma.analyze(stemmedToken,MAR);
			System.out.println(mat.getAmbiguityScheme());
		}
		
	}
}
