/**
 * Project: GNormPlus
 * Function: Gene Normalization
 */

package GNormPluslib;

import bioc.BioCAnnotation;
import bioc.BioCCollection;
import bioc.BioCDocument;
import bioc.BioCLocation;
import bioc.BioCPassage;

import bioc.io.BioCDocumentWriter;
import bioc.io.BioCFactory;
import bioc.io.woodstox.ConnectorWoodstox;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.text.BreakIterator;
import java.time.LocalDate;
import java.time.ZoneId;
import java.text.DecimalFormat;
import java.math.RoundingMode;

import javax.xml.stream.XMLStreamException;

import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;

public class GN 
{
	/*
	 * Class-wide String-to-String map (public, static, mutable, initially empty).
	 * NOTE(review): nothing in the part of this file shown here reads or writes it;
	 * presumably it is filled and consumed elsewhere - confirm before relying on it.
	 */
	public static HashMap<String, String> MatchedTokens_hash = new HashMap<String, String>();
	/** Matches a "digits-digits" candidate id; group 1 is the number after the hyphen. */
	private static final Pattern SCORING_HYPHENATED_ID = Pattern.compile("[0-9]+\\-([0-9]+)");

	/**
	 * Normalizes a long form or a mention and splits it into tokens.
	 * The text is lower-cased, a blank is inserted at every digit/letter boundary,
	 * every non-word character, hyphen and underscore becomes a blank, runs of
	 * blanks are collapsed, and the result is split on single blanks.
	 * A leading blank in the normalized text yields an empty first token.
	 *
	 * @param text raw text; must not be null
	 * @return the tokens, in order of appearance
	 */
	private static String[] tokenizeForScoring(String text)
	{
		String normalized = text.toLowerCase();
		normalized = normalized.replaceAll("([0-9])([a-z])", "$1 $2");
		normalized = normalized.replaceAll("([a-z])([0-9])", "$1 $2");
		normalized = normalized.replaceAll("([\\W\\-\\_])", " ");
		normalized = normalized.replaceAll("[ ]+", " ");
		return normalized.split(" ");
	}

	/**
	 * Scores one candidate gene id against the mentions recognized in a document.
	 *
	 * The candidate is first turned into a lexicon key: "Homo:" plus the number after
	 * the hyphen when the id contains "digits-digits", otherwise "Gene:" plus the id.
	 * When GNormPlus.GeneScoring_hash has no such key the score is 0.0.
	 *
	 * Otherwise every lexicon token of that gene that also occurs in some mention
	 * contributes TF * IDF * 1/(1-TF), where TF is the token frequency divided by the
	 * gene's total token count and IDF comes from GNormPlus.GeneScoringDF_hash.
	 * Each long-form token equal to a contributing token adds 1 to the score.
	 * Note that TF == 1 makes the contribution infinite (floating-point division by zero).
	 *
	 * @param geneid candidate id as found in the gene lexicon
	 * @param Mention_hash recognized mentions of the document; only the keys are read
	 * @param LF long form associated with the mention, or "" when there is none; must not be null
	 * @return the score; 0.0 when the gene is unknown to the scoring lexicon
	 */
	private double ScoringFunction(String geneid,HashMap<String,String> Mention_hash,String LF)
	{
		String LF_tkn[] = tokenizeForScoring(LF);

		// define gene/homo id
		Matcher hyphenated = SCORING_HYPHENATED_ID.matcher(geneid);
		if(hyphenated.find())
		{
			geneid = "Homo:"+hyphenated.group(1);
		}
		else
		{
			geneid = "Gene:"+geneid;
		}

		if(!GNormPlus.GeneScoring_hash.containsKey(geneid))
		{
			//System.out.println("Error: cannot find geneid: "+geneid+" in GeneScoring_hash");
			return 0.0;
		}

		/*
		 * Tokens in Query (Gene id lexicon)
		 * Example record: Gene:2664293	cmk-1,cytidylate-1,kinase-1,mssa-1	0.4096	4	0.0625	1	2.0
		 * The value is tab separated; column 0 holds "token-frequency" pairs and
		 * column 2 the total token count. The remaining columns are not used here.
		 */
		String l[] = GNormPlus.GeneScoring_hash.get(geneid).split("\t");
		HashMap<String,Double> TermFrequency = new HashMap<String,Double>();
		for(String tokenWithFreq : l[0].split(","))
		{
			String Tkn_Freq[] = tokenWithFreq.split("-");
			TermFrequency.put(Tkn_Freq[0], Double.parseDouble(Tkn_Freq[1]));
		}
		double AllTknNum = Double.parseDouble(l[2]);

		/*
		 * Tokens in Document (recognized mentions): keep the lexicon tokens that
		 * occur in at least one mention (token i in gene j).
		 */
		HashMap<String,Double> TF = new HashMap<String,Double>();
		for(String Mention : Mention_hash.keySet())
		{
			for(String tkn : tokenizeForScoring(Mention))
			{
				if(TermFrequency.containsKey(tkn))
				{
					TF.put(tkn, TermFrequency.get(tkn));
				}
			}
		}

		double score = 0.0;
		int LF_ParticalMatch = 0;
		for(String Tkn : TF.keySet())
		{
			// count long-form tokens equal to this matched token
			for(int t=0;t<LF_tkn.length;t++)
			{
				if(LF_tkn[t].equals(Tkn))
				{
					LF_ParticalMatch++;
				}
			}

			double TFij = TF.get(Tkn)/AllTknNum;
			double IDFi = GNormPlus.GeneScoringDF_hash.get(Tkn);
			score = score+TFij*IDFi*(1/(1-TFij));
		}
		if(LF_ParticalMatch>0)
		{
			score = score + LF_ParticalMatch;
		}
		return score;
	}
	
	/**
	 * Adds spelling variants to every "Gene" annotation held in GNormPlus.BioCDocobj.Annotations.
	 *
	 * An annotation is a tab separated string: start, last, mentions, type and an
	 * optional id. The mentions column is a "|" separated list. For each alternative
	 * that was present before this call the following variants are appended, in this order:
	 * <ul>
	 * <li>trailing "p" (after a digit or capital, optional blanks) dropped</li>
	 * <li>trailing "nu" dropped</li>
	 * <li>"alpha" replaced by "a", "beta" replaced by "b" (last occurrence)</li>
	 * <li>trailing "a"/"b" after a digit replaced by "alpha"/"beta"</li>
	 * <li>"II"/"III" followed by one final lower-case letter replaced by "2"/"3"</li>
	 * </ul>
	 * The annotation is rewritten in place only when at least one variant was added;
	 * an annotation without id column then gets an empty fifth column.
	 *
	 * @param Filename not used by this method
	 * @param FilenameBioC not used by this method
	 * @throws IOException declared for callers; not thrown by the current body
	 * @throws XMLStreamException declared for callers; not thrown by the current body
	 */
	public void PreProcessing4GN(String Filename,String FilenameBioC) throws IOException, XMLStreamException
	{
		// The patterns do not depend on the annotation, so compile them once per call.
		final Pattern trailingP = Pattern.compile("^(.*[0-9A-Z])[ ]*p$");
		final Pattern trailingNu = Pattern.compile("^(.+)nu$");
		final Pattern innerAlpha = Pattern.compile("^(.*)alpha(.*)$");
		final Pattern innerBeta = Pattern.compile("^(.*)beta(.*)$");
		final Pattern trailingA = Pattern.compile("^(.+[0-9])a$");
		final Pattern trailingB = Pattern.compile("^(.+[0-9])b$");
		final Pattern romanTwo = Pattern.compile("^(.+)II([a-z])$");
		final Pattern romanThree = Pattern.compile("^(.+)III([a-z])$");

		for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++)
		{
			for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++)
			{
				for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++)
				{
					String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
					String start=anno[0];
					String last=anno[1];
					String mentions=anno[2];
					String type=anno[3];
					String id="";
					if(anno.length>=5)
					{
						id=anno[4];
					}

					if(!type.equals("Gene"))
					{
						continue;
					}

					// Only the alternatives present before expansion are inspected;
					// variants are appended to a separate buffer.
					StringBuilder expanded = new StringBuilder(mentions);
					boolean update=false;
					for(String alternative : mentions.split("\\|"))
					{
						Matcher m = trailingP.matcher(alternative);
						if(m.find())
						{
							expanded.append("|").append(m.group(1));
							update=true;
						}
						m = trailingNu.matcher(alternative);
						if(m.find())
						{
							expanded.append("|").append(m.group(1));
							update=true;
						}
						m = innerAlpha.matcher(alternative);
						if(m.find())
						{
							expanded.append("|").append(m.group(1)).append("a").append(m.group(2));
							update=true;
						}
						m = innerBeta.matcher(alternative);
						if(m.find())
						{
							expanded.append("|").append(m.group(1)).append("b").append(m.group(2));
							update=true;
						}
						m = trailingA.matcher(alternative);
						if(m.find())
						{
							expanded.append("|").append(m.group(1)).append("alpha");
							update=true;
						}
						m = trailingB.matcher(alternative);
						if(m.find())
						{
							expanded.append("|").append(m.group(1)).append("beta");
							update=true;
						}
						m = romanTwo.matcher(alternative);
						if(m.find())
						{
							expanded.append("|").append(m.group(1)).append("2").append(m.group(2));
							update=true;
						}
						m = romanThree.matcher(alternative);
						if(m.find())
						{
							expanded.append("|").append(m.group(1)).append("3").append(m.group(2));
							update=true;
						}
					}
					if(update)
					{
						GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start + "\t" + last + "\t" + expanded + "\t" + type + "\t" + id );
					}
				}
			}
		}
		//GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
	}
	
	/**
	 * Records, per document, the ids attached to chromosome-location hits.
	 *
	 * Each passage text is searched with GNormPlus.PT_GeneChromosome for
	 * "ChromosomeLocation" mentions. A hit is a tab separated string whose fourth
	 * column lists ids separated by "|" or ","; for every id the key
	 * pmid + TAB + id is stored (with an empty value) in GNormPlus.Pmid2ChromosomeGene_hash.
	 *
	 * @param Filename not used by this method
	 * @param FilenameBioC not used by this method
	 * @throws IOException declared for callers; not thrown by the current body
	 * @throws XMLStreamException declared for callers; not thrown by the current body
	 */
	public void ChromosomeRecognition(String Filename,String FilenameBioC) throws IOException, XMLStreamException
	{
		for (int doc = 0; doc < GNormPlus.BioCDocobj.PMIDs.size(); doc++) // documents
		{
			String pmid = GNormPlus.BioCDocobj.PMIDs.get(doc);
			for (int passage = 0; passage < GNormPlus.BioCDocobj.PassageNames.get(doc).size(); passage++) // passages
			{
				String passageText = GNormPlus.BioCDocobj.PassageContexts.get(doc).get(passage);

				// chromosome recognition on the passage text
				ArrayList<String> hits = GNormPlus.PT_GeneChromosome.SearchMentionLocation(passageText,"ChromosomeLocation");
				for (String hit : hits)
				{
					// columns: start, last, mention, ids - only the ids are needed here
					String idColumn = hit.split("\t")[3];
					for (String geneId : idColumn.split("[\\|,]"))
					{
						GNormPlus.Pmid2ChromosomeGene_hash.put(pmid+"\t"+geneId,"");
					}
				}
			}
		}
		//GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
	}
	
	/**
	 * Assigns gene ids to the annotations in GNormPlus.BioCDocobj.Annotations, document by document.
	 *
	 * An annotation is a tab separated string: start, last, mentions ("|" separated
	 * alternatives), type and an optional fifth column (species prefix and taxonomy ids
	 * on input; "prefix:taxid|geneid" once a gene id has been appended).
	 *
	 * Per document the method
	 * <ol>
	 * <li>collects the mentions typed Species/Genus/Strain/CellLine/Cell;</li>
	 * <li>collects the Gene mentions (after filtering) together with their offsets;</li>
	 * <li>looks each mention up in GNormPlus.PT_Gene and narrows the candidate ids
	 *     (official name, single candidate, chromosome location, ids already guaranteed
	 *     for another mention, long form/abbreviation pairs, ScoringFunction ranking);</li>
	 * <li>drops abbreviations whose long form shares no token with the chosen gene and
	 *     mentions of at most two characters without long form;</li>
	 * <li>appends "|geneid" to the Gene annotations;</li>
	 * <li>annotates further occurrences of normalized mentions in the passage text;</li>
	 * <li>resolves FamilyName/DomainMotif annotations against the gene ids found;</li>
	 * <li>strips "*" and "(anti)" from species ids;</li>
	 * <li>removes Gene annotations that equal a species mention, have no id, or lie
	 *     inside another annotation.</li>
	 * </ol>
	 *
	 * @param Filename passed through to BioCOutput
	 * @param FilenameBioC passed through to BioCOutput
	 * @param GeneIDMatch when false the result is written with BioCOutput; when true nothing is written here
	 * @throws IOException declared for the BioCOutput call
	 * @throws XMLStreamException declared for the BioCOutput call
	 */
	public void GeneNormalization(String Filename,String FilenameBioC,boolean GeneIDMatch) throws IOException, XMLStreamException
	{
		// Only used by the "All" output style below.
		final DecimalFormat df = new DecimalFormat("0.####");
		df.setRoundingMode(RoundingMode.HALF_UP);
		
		//Tokenization
		for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++) /** PMIDs : i */
		{
			String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i);
			
			/** Species: mentions column of every species-like annotation (used by the final clean-up) */
			HashMap<String,String> Species_hash = new HashMap<String,String>();
			for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) /** Paragraphs : j */
			{
				for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) /** Annotation : k */
				{
					String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
					String mentions=anno[2];
					String type=anno[3];
					if(type.matches("(Species|Genus|Strain|CellLine|Cell)"))
					{
						Species_hash.put(mentions,"");
					}
				}
			}
			
			
			/*
			 * Collect Gene mentions :
			 * 
			 *  GeneMention-taxid	->	"ID" : geneid
			 *  					->	"type" : "Gene"
			 *  					->	start1-last1 : ""
			 *  					->	start2-last2 : ""
			 *  					->	start3-last3 : ""
			 */

			// Lower-cased text of all passages, used to look for long forms of filtered abbreviations.
			String tiabs="";
			for (int j = 0; j < GNormPlus.BioCDocobj.PassageContexts.get(i).size(); j++) /** Paragraphs : j */
			{
				tiabs=tiabs+GNormPlus.BioCDocobj.PassageContexts.get(i).get(j).toLowerCase();
			}
			HashMap<String,HashMap<String,String>> GeneMention_hash = new HashMap<String,HashMap<String,String>>();
			HashMap<String,String> Mention_hash = new HashMap<String,String>();
			for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) /** Paragraphs : j */
			{
				for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) /** Annotation : k */
				{
					String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
					String start=anno[0];
					String last=anno[1];
					String mentions=anno[2];
					String type=anno[3];
					String taxids="Tax:9606";
					
					if(anno.length>=5)
					{
						taxids=anno[4];
					}
					// Key for Filtering_hash: lower case, no punctuation, every digit replaced by 0.
					String mentions_tmp=mentions.toLowerCase();
					mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
					mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
					taxids=taxids.replaceAll("(Focus|Right|Left|Prefix|Tax):","");
					if(taxids.equals(""))
					{
						taxids="9606";
					}
					/** Filtering */
					boolean found_filter = false;
					if(GNormPlus.Filtering_hash.containsKey(mentions_tmp)) // filtering
					{
						found_filter=true;
					}
					
					if(found_filter==false) //abbreviation
					{
						// Filter the abbreviation only when its listed long form occurs in the text.
						// Both the key and the long form are used as regular expressions.
						for(String f : GNormPlus.Filtering_WithLongForm_hash.keySet())
						{
							if( GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*[\\t\\|]"+f+"\tGene.*") ||
								GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*\\t"+f+"\\|[^\t]+\tGene.*")
									)
							{
								String lf=GNormPlus.Filtering_WithLongForm_hash.get(f);
								if(tiabs.matches(".*"+lf+".*"))
								{
									found_filter=true;
									break;
								}
							}
						}
					}
					
					if(found_filter==false)
					{
						// A Gene annotation whose first or last alternative is a single lower-case letter is filtered.
						if( GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*[\\t\\|][a-z]\tGene.*") ||
								GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*\\t[a-z]\\|[^\t]+\tGene.*") //32171191	Wuhan's
									)
						{
							found_filter=true;
						}
					}
					
					if(found_filter == false)
					{
						if(type.matches("Gene"))
						{
							if(GeneMention_hash.containsKey(mentions+"\t"+taxids))
							{
								GeneMention_hash.get(mentions+"\t"+taxids).put(start+"\t"+last,"");
							}
							else 
							{
								HashMap<String,String> offset_hash = new HashMap<String,String>();
								offset_hash.put(start+"\t"+last,"");
								GeneMention_hash.put(mentions+"\t"+taxids, offset_hash);
								GeneMention_hash.get(mentions+"\t"+taxids).put("type", type);
								Mention_hash.put(mentions,"Gene");
							}
						}
						else if(type.matches("(FamilyName|DomainMotif)"))
						{
							String GMs[]=mentions.split("\\|");
							for(int g=0;g<GMs.length;g++)
							{
								String mention = GMs[g];
								Mention_hash.put(mention,"FamilyDomain");
							}
						}
					}
					
				}
			}
			
			/*
			 * Gene id refinement:
			 *  1. Official name
			 *  2. only one gene
			 */
			HashMap<String,String> GuaranteedGene2ID = new HashMap<String,String>();
			HashMap<String,String> MultiGene2ID = new HashMap<String,String>();
			for(String GeneMentionTax : GeneMention_hash.keySet())
			{
				String GT[]=GeneMentionTax.split("\\t");
				String mentions=GT[0];
				String taxids=GT[1];
				String GMs[]=mentions.split("\\|");
				
				HashMap<String,String> taxids_hash = new HashMap<String,String>();
				String taxids_arr[]=taxids.split(",");
				for(int t=0;t<taxids_arr.length;t++)
				{
					taxids_hash.put(taxids_arr[t], "");
				}
				
				for(int ms=0;ms<GMs.length;ms++)
				{
					String mention = GMs[ms];
					String IDstr = GNormPlus.PT_Gene.MentionMatch(mention); /** searched by PT_Gene */
					String IDs[]=IDstr.split("\\|");
					
					/*
					 * printing the ambiguous gene mentions and candidates
					 */
					//String IDs_s[]=IDstr.split(",");
					//if(IDs_s.length>1)
					//{
					//	System.out.println(Pmid+"\t"+mention+"\t"+mentions+"\t"+IDstr);
					//}
					
					// Take the first candidate group whose taxonomy id is one of the mention's taxids.
					for(int c=0;c<IDs.length;c++)
					{
						String tax2ID[]=IDs[c].split(":"); // tax2ID[0] = taxid ; tax2ID[1] = geneids
						if(taxids_hash.containsKey(tax2ID[0]))
						{
							String geneid=tax2ID[1];
							String TargetTax=tax2ID[0];
							GeneMention_hash.get(GeneMentionTax).put("ID", geneid);
							GeneMention_hash.get(GeneMentionTax).put("TargetTax", TargetTax);
							break;
						}
					}
					
					//geneid refinement
					if(GeneMention_hash.get(GeneMentionTax).containsKey("ID"))
					{
						Pattern ptmp = Pattern.compile("\\*([0-9]+(\\-[0-9]+|))");
						Matcher mtmp = ptmp.matcher(GeneMention_hash.get(GeneMentionTax).get("ID"));
						
						if(mtmp.find()) // 1. Official Name
						{
							GeneMention_hash.get(GeneMentionTax).put("ID",mtmp.group(1));
							GuaranteedGene2ID.put(GeneMentionTax,mtmp.group(1));
						}
						else if(GeneMention_hash.get(GeneMentionTax).get("ID").matches("[0-9]+(\\-[0-9]+|)")) // 2. only one gene
						{
							GuaranteedGene2ID.put(GeneMentionTax,GeneMention_hash.get(GeneMentionTax).get("ID"));
						}
						else
						{
							String ID[] = GeneMention_hash.get(GeneMentionTax).get("ID").split(",");
							boolean FoundByChroLoca=false;
							for(int idcount=0;idcount<ID.length;idcount++)
							{
								if(GNormPlus.Pmid2ChromosomeGene_hash.containsKey(Pmid+"\t"+ID[idcount])) // 3. Chromosome location
								{
									GuaranteedGene2ID.put(GeneMentionTax,ID[idcount]);
									FoundByChroLoca=true;
									break;
								}
							}
							if(FoundByChroLoca == false)
							{
								MultiGene2ID.put(GeneMentionTax, GeneMention_hash.get(GeneMentionTax).get("ID"));
							}
						}
					}
					// Stop trying further alternatives once this one is a key of suffixprefix_orig2modified
					// and the lookup did not return one of the codes -1, -2, -3.
					if(GNormPlus.suffixprefix_orig2modified.containsKey(mention) && (!IDstr.equals("-1")) && (!IDstr.equals("-2")) && (!IDstr.equals("-3")))
					{
						break;
					}
				}
			}
			
			/*
			 * Gene id refinement:
			 *  3. multiple genes but can be inferred by 1. and 2.
			 */
			for(String GeneMentionTax_M : MultiGene2ID.keySet())
			{
				for(String GeneMentionTax_G : GuaranteedGene2ID.keySet())
				{
					String MG[] = MultiGene2ID.get(GeneMentionTax_M).split(",");
					for(int m=0;m<MG.length;m++)
					{
						if(MG[m].equals(GuaranteedGene2ID.get(GeneMentionTax_G)))
						{
							GeneMention_hash.get(GeneMentionTax_M).put("ID",MG[m]);
						}
					}
				}
			}
			
			/*
			 * Gene id refinement:
			 *  4. FullName -> Abbreviation
			 */
			for(String GeneMentionTax : GeneMention_hash.keySet())
			{
				String MT[] = GeneMentionTax.split("\\t");
				if(GNormPlus.PmidLF2Abb_hash.containsKey(Pmid+"\t"+MT[0]))
				{
					String GeneMentionTax_Abb = GNormPlus.PmidLF2Abb_hash.get(Pmid+"\t"+MT[0]) + "\t" + MT[1];
					if(GeneMention_hash.containsKey(GeneMentionTax_Abb) && GeneMention_hash.get(GeneMentionTax).containsKey("ID"))
					{
						GeneMention_hash.get(GeneMentionTax_Abb).put("ID", GeneMention_hash.get(GeneMentionTax).get("ID"));
					}
				}
			}
			
			/*
			 * Gene id refinement:
			 *  5. Ranking by scoring function (inference network)
			 */
			for(String GeneMentionTax : GeneMention_hash.keySet())
			{
				if(GeneMention_hash.get(GeneMentionTax).containsKey("ID") && GeneMention_hash.get(GeneMentionTax).get("ID").matches(".+,.+"))
				{
					String geneids=GeneMention_hash.get(GeneMentionTax).get("ID");
					String geneid[] = geneids.split(",");
					
					String OutputStyle="Top1";
					if(OutputStyle.equals("Top1"))
					{
						//only return the best one; the id becomes "" when no candidate scores above 0
						double max_score=0.0;
						String target_geneid="";
						for(int g=0;g<geneid.length;g++)
						{
							String MT[] = GeneMentionTax.split("\\t");
							String LF="";
							if(GNormPlus.PmidAbb2LF_hash.containsKey(Pmid+"\t"+MT[0]))
							{
								LF = GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+MT[0]);
							}
							double score = ScoringFunction(geneid[g],Mention_hash,LF);
							if(score>max_score)
							{
								max_score=score;
								target_geneid=geneid[g];
							}
						}
						GeneMention_hash.get(GeneMentionTax).put("ID", target_geneid);
					}
					else // "All"
					{
						//return all geneids, each followed by "-" and its rounded score
						String geneSTR="";
						for(int g=0;g<geneid.length;g++)
						{
							String MT[] = GeneMentionTax.split("\\t");
							String LF="";
							if(GNormPlus.PmidAbb2LF_hash.containsKey(Pmid+"\t"+MT[0]))
							{
								LF = GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+MT[0]);
							}
							double score = ScoringFunction(geneid[g],Mention_hash,LF);
							String hoge = df.format(score);
							score=Double.parseDouble(hoge);
							
							if(geneSTR.equals(""))
							{
								geneSTR=geneid[g]+"-"+score;
							}
							else
							{
								geneSTR=geneSTR+","+geneid[g]+"-"+score;
							}
						}
						GeneMention_hash.get(GeneMentionTax).put("ID", geneSTR);
					}
				}
			}
			
			/*
			 * Gene id refinement:
			 *  6. Abbreviation -> FullName
			 *
			 * NOTE(review): the original comment marked this step as
			 * "removed (Reason: cause too much False Positive)", but the loop below is
			 * live code and still copies the abbreviation's id to its long form.
			 * Confirm which of the two is intended.
			 */
			for(String GeneMentionTax : GeneMention_hash.keySet())
			{
				String MT[] = GeneMentionTax.split("\\t");
				if(GNormPlus.PmidAbb2LF_hash.containsKey(Pmid+"\t"+MT[0]))
				{
					String GeneMentionTax_LF = GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+MT[0]) + "\t" + MT[1];
					if(GeneMention_hash.containsKey(GeneMentionTax_LF) && GeneMention_hash.get(GeneMentionTax).containsKey("ID"))
					{
						GeneMention_hash.get(GeneMentionTax_LF).put("ID", GeneMention_hash.get(GeneMentionTax).get("ID"));
					}
				}
			}
			
			/*
			 * Gene id refinement:
			 *  7. The inference network tokens of Abbreviation.ID should contain at least LF tokens
			 *  8. The short mention should be filtered if not long form support
			 */
			ArrayList<String> removeGMT = new ArrayList<String>();
			for(String GeneMentionTax : GeneMention_hash.keySet())
			{
				String GT[]=GeneMentionTax.split("\\t");
				String mentions=GT[0];
				String tax=GT[1];
				if(GeneMention_hash.get(GeneMentionTax).containsKey("type") && GeneMention_hash.get(GeneMentionTax).get("type").equals("Gene") && GeneMention_hash.get(GeneMentionTax).containsKey("ID"))
				{
					String id = GeneMention_hash.get(GeneMentionTax).get("ID");
					// Same key scheme as ScoringFunction: "Homo:<second number>" or "Gene:<number>".
					String geneid="";
					Pattern ptmp1 = Pattern.compile("^([0-9]+)\\-([0-9]+)$");
					Pattern ptmp2 = Pattern.compile("^([0-9]+)$");
					Matcher mtmp1 = ptmp1.matcher(id);
					Matcher mtmp2 = ptmp2.matcher(id);
					//System.out.println(id);
					if(mtmp1.find())
					{
						geneid = "Homo:"+mtmp1.group(2);
					}
					else if(mtmp2.find())
					{
						geneid = "Gene:"+mtmp2.group(1);
					}
					
					boolean LongFormTknMatch= false;
					boolean LongFormExist= true;
					if(GNormPlus.GeneScoring_hash.containsKey(geneid))
					{
						if(GNormPlus.PmidAbb2LF_lc_hash.containsKey(Pmid+"\t"+mentions.toLowerCase()))
						{
							/*
							 * token in lexicon : tkn_lexicon
							 * token in mention : tkn_mention
							 */
							String l[]=GNormPlus.GeneScoring_hash.get(geneid).split("\t"); // Gene:2664293	cmk-1,cytidylate-1,kinase-1,mssa-1	0.4096	4	0.0625	1	2.0
							String tkns_Gene[] = l[0].split(",");
							ArrayList<String> tkn_lexicon = new ArrayList<String>();
							for(int ti=0;ti<tkns_Gene.length;ti++)
							{
								String Tkn_Freq[] = tkns_Gene[ti].split("-");
								tkn_lexicon.add(Tkn_Freq[0]);
							}
							
							String LF_lc=GNormPlus.PmidAbb2LF_lc_hash.get(Pmid+"\t"+mentions.toLowerCase());
							LF_lc = LF_lc.replaceAll("([0-9])([A-Za-z])", "$1 $2");
							LF_lc = LF_lc.replaceAll("([A-Za-z])([0-9])", "$1 $2");
							String tkn_mention[] = LF_lc.split("[\\W\\-\\_]");
							// One shared non-numeric token between lexicon and long form is enough.
							for(int tl=0;tl<tkn_lexicon.size();tl++)
							{
								for(int tm=0;tm<tkn_mention.length;tm++)
								{
									if(tkn_lexicon.get(tl).equals(tkn_mention[tm]) && (!tkn_mention[tm].matches("[0-9]+")))
									{
										LongFormTknMatch = true;
									}
								}	
							}
						}
						else{LongFormExist = false;}
					}
					else{LongFormTknMatch = true;} // exception
					
					if(LongFormTknMatch == false && LongFormExist == true) // 7.
					{
						removeGMT.add(GeneMentionTax); //remove short form
						removeGMT.add(GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+mentions)+"\t"+tax); //remove long form
					}
					else if(mentions.length()<=2 && LongFormExist == false) // 8.
					{
						removeGMT.add(GeneMentionTax);
					}
				}
			}
			
			// Removal is deferred to here so the map is not modified while it is iterated.
			for(int gmti=0;gmti<removeGMT.size();gmti++) // remove
			{
				GeneMention_hash.remove(removeGMT.get(gmti));
			}
						
			// Append gene ids
			for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) // Paragraphs : j
			{
				for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
				{
					String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
					String start=anno[0];
					String last=anno[1];
					String mentions=anno[2];
					String type=anno[3];
					String taxid_org="Tax:9606";
					if(anno.length>=5)
					{
						taxid_org=anno[4];
					}
					String taxids=taxid_org.replaceAll("(Focus|Right|Left|Prefix|Tax):","");
					
					// Replace the taxonomy id list by the single taxonomy id the gene was found under.
					if(GeneMention_hash.containsKey(mentions+"\t"+taxids) && GeneMention_hash.get(mentions+"\t"+taxids).containsKey("TargetTax"))
					{
						String taxtype=taxid_org.replaceAll(":([0-9,]+)","");
						String taxid=GeneMention_hash.get(mentions+"\t"+taxids).get("TargetTax");
						GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mentions+"\t"+type+"\t"+taxtype+":"+taxid);
					}
					
					if(type.equals("Gene"))
					{
						// Append "|", then "<id>," when an id exists, then cut the last character:
						// the result ends in "|<id>" with an id and is unchanged without one.
						GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k) + "|");
						
						if(GeneMention_hash.containsKey(mentions+"\t"+taxids) && GeneMention_hash.get(mentions+"\t"+taxids).containsKey("ID"))
						{
							GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k) + GeneMention_hash.get(mentions+"\t"+taxids).get("ID") + "," );
						}
						else // cannot find appropriate species
						{
							//System.out.println(mention+"\t"+taxid);
						}
						GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).substring(0, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).length()-1)); // remove ",$"
					}
				}
			}
			
			//Extend to all gene mentions
			HashMap<String,String> GeneMentions = new HashMap<String,String>(); // lower-cased mentions -> "prefix:taxid|geneid"
			HashMap<String,String> GeneMentionLocation = new HashMap<String,String>(); // "paragraph TAB offset" of every covered position
			for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
			{
				for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
				{
					String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
					int start = Integer.parseInt(anno[0]);
					int last = Integer.parseInt(anno[1]);
					String mentions=anno[2];
					String type=anno[3];
					String id="Tax:9606";
					if(anno.length>=5)
					{
						id=anno[4];
					}
					if(type.equals("Gene") && id.matches("(Focus|Right|Left|Prefix|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)"))
					{
						GeneMentions.put(mentions.toLowerCase(), id);
						for (int s=start ;s<=last;s++)
						{
							GeneMentionLocation.put(j+"\t"+s,"");
						}
					}
					else if(type.equals("Gene") && id.matches("(Focus|Right|Left|Prefix|Tax)\\:([0-9]+)\\|([0-9]+)"))
					{
						GeneMentions.put(mentions.toLowerCase(), id);
						for (int s=start ;s<=last;s++)
						{
							GeneMentionLocation.put(j+"\t"+s,"");
						}
					}
				}
			}
			for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
			{
				if(GNormPlus.BioCDocobj.PassageContexts.size()>i && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j)
				{
					// The text is padded with one blank on each side so a mention at either end has a delimiter.
					String PassageContexts = " " + GNormPlus.BioCDocobj.PassageContexts.get(i).get(j) + " ";
					String PassageContexts_tmp = PassageContexts.toLowerCase();
					for(String gm : GeneMentions.keySet())
					{
						String id = GeneMentions.get(gm);
						if(gm.length()>=3)
						{
							// Trim blanks and "|" at both ends and collapse repeated "|" so the
							// alternatives form a valid regex alternation.
							gm = gm.replaceAll("[ ]*[\\|]*$", "");
							gm = gm.replaceAll("^[\\|]*[ ]*", "");
							gm = gm.replaceAll("[\\|][\\|]+", "\\|");
							if(!gm.matches("[\\W\\-\\_]*"))
							{
								// Escape everything except letters, digits, "|" and blanks.
								gm = gm.replaceAll("([^A-Za-z0-9\\| ])", "\\\\$1");
								Pattern ptmp = Pattern.compile("^(.*[\\W\\-\\_])("+gm+")([\\W\\-\\_].*)$");
								Matcher mtmp = ptmp.matcher(PassageContexts_tmp);
								while(mtmp.find())
								{
									String pre = mtmp.group(1);
									String gmtmp = mtmp.group(2);
									String post = mtmp.group(3);
			
									// -1 compensates for the blank prepended to the passage text.
									int start = pre.length()-1;
									int last = start+gmtmp.length();
									if(PassageContexts.length()>=last+1)
									{
										String mention = PassageContexts.substring(start+1,last+1);
										if(!GeneMentionLocation.containsKey(j+"\t"+start) && !GeneMentionLocation.containsKey(j+"\t"+last))
										{
											GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tGene\t"+id);
										}
									}
									// Mask the occurrence with "@" so the next search finds another one.
									gmtmp = gmtmp.replaceAll(".", "\\@");
									PassageContexts_tmp=pre+""+gmtmp+""+post;
									mtmp = ptmp.matcher(PassageContexts_tmp);
								}
							}
						}
					}
				}
			}
			
			//Apply to FamilyNames
			HashMap<String,String> geneids = new HashMap<String,String>(); // first number after "|" of every normalized Gene annotation
			for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
			{
				for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k
				{
					String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
					String type=anno[3];
					if(type.equals("Gene"))
					{
						String id="Tax:9606";
						if(anno.length>=5)
						{
							id=anno[4];
						}
						Pattern ptmp0 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)$");
						Matcher mtmp0 = ptmp0.matcher(id);
						Pattern ptmp1 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)$");
						Matcher mtmp1 = ptmp1.matcher(id);
						if(mtmp0.find())
						{
							geneids.put(mtmp0.group(3), "");
						}
						if(mtmp1.find())
						{
							geneids.put(mtmp1.group(3), "");
						}
					}
				}
			}
			for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
			{
				// Backwards, because annotations are removed while iterating.
				for (int k = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k >=0 ; k--) // Annotation : k
				{
					String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
					String mention=anno[2];
					String type=anno[3];
					if(type.matches("(FamilyName|DomainMotif)"))
					{
						String id="Tax:9606";
						if(anno.length>=5)
						{
							id=anno[4];
						}
						// Keep only the family ids that were also found as gene ids in this document.
						String IDstrs = GNormPlus.PT_FamilyName.MentionMatch(mention);
						String IDstr[]=IDstrs.split("\\|");
						String ids="";
						for(int id_i=0;id_i<IDstr.length;id_i++)
						{
							if(geneids.containsKey(IDstr[id_i]))
							{
								 if(ids.equals(""))
								 {
									 ids=IDstr[id_i];
								 }
								 else
								 {
									 ids=ids+";"+IDstr[id_i];
								 }
							}
						}
						if(!ids.equals(""))
						{
							if(type.equals("FamilyName")){type="Gene";}
							String Annotation_k=anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+type+"\tTax:9606";
							if(anno.length>=5)
							{
								Annotation_k=anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+type+"\t"+anno[4];
							}
							GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k,Annotation_k+"|"+ids);
						}
						else
						{
							GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k);
						}
					}
				}
			}
			//Species "*" and "(anti)" removed.
			for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
			{
				for (int k = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k >=0 ; k--) // Annotation : k
				{
					String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
					String type=anno[3];
					// An annotation without id column has nothing to clean and is left untouched
					// (split drops a trailing empty column, so anno[4] may not exist).
					if(anno.length>=5 && (type.equals("Species") || type.equals("Genus") || type.equals("Strain") || type.equals("CellLine") || type.equals("Cell")))
					{
						String id=anno[4];
						id=id.replaceAll("\\*", "");
						id=id.replaceAll("\\(anti\\)", "");
						String Annotation_k=anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+type+"\t"+id;
						GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k,Annotation_k);
					}
				}
			}
			
			// Final clean-up of Gene annotations and of annotations nested in another one.
			for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) // Paragraph
			{
				
				for (int k = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k >=0 ; k--) // Annotation : k
				{
					String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
					int start = Integer.parseInt(anno[0]);
					int last = Integer.parseInt(anno[1]);
					String mention = anno[2];
					String type = anno[3];
					// split drops a trailing empty column: a missing id column means "no id".
					String id = (anno.length>=5) ? anno[4] : "";
					if(type.equals("Gene") && Species_hash.containsKey(mention))
					{
						GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k);
					}
					else if(type.equals("Gene") && id.equals(""))
					{
						GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k);
					}
					else
					{
						// Drop this annotation when another one of the same paragraph strictly contains its span.
						for (int k1 = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k1 >=0 ; k1--) // Annotation : k1
						{
							if(k1 != k)
							{
								String anno1[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k1).split("\t");
								int start1 = Integer.parseInt(anno1[0]);
								int last1 = Integer.parseInt(anno1[1]);
								if((start1<start && last1>=last) || (start1<=start && last1>last))
								{
									GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k);
									break;
								}
							}
						}
					}
				}
			}
		}
		if(GeneIDMatch == true)
		{
			//GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
		}
		else
		{
			GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,true,true);
		}
	}
	/*
	 * Search potential GeneID mentions in the text by regular-expression pattern matching
	 */
	/**
	 * Splits a padded document into (text before)(candidate token)(text after).
	 * The candidate token starts and ends with a letter or digit, contains at least one
	 * letter and one digit, and has no whitespace inside. Because the leading group is
	 * greedy, the candidate found is the last one still present in the text.
	 */
	private static final Pattern GENEID_CANDIDATE_PATTERN = Pattern.compile("^(.*[^A-Za-z0-9]+)([0-9]+\\S*[A-Za-z]+|[A-Za-z]+\\S*[0-9]+|[0-9]+\\S*[A-Za-z]+\\S*[0-9]+|[A-Za-z]+\\S*[0-9]+\\S*[A-Za-z]+)([^A-Za-z0-9]+.*)$");
	/** Mentions containing any of ' ; [ ] + * or a backslash are discarded. */
	private static final Pattern GENEID_FORBIDDEN_CHARS_PATTERN = Pattern.compile(".*[\\'\\;\\[\\]\\+\\*\\\\].*");
	/** Mention containing an opening parenthesis followed later by a closing one. */
	private static final Pattern GENEID_PAREN_PAIR_PATTERN = Pattern.compile(".*\\(.*\\).*");
	/** Mention containing no parenthesis at all. */
	private static final Pattern GENEID_NO_PAREN_PATTERN = Pattern.compile("[^\\(\\)]+");
	/** Range written as stem + number + "-" + number. */
	private static final Pattern GENEID_NUMERIC_RANGE_PATTERN = Pattern.compile("^(.+[^0-9])([0-9]+)\\-([0-9]+)$");
	/** Range written as stem + number + "-" + stem + number. */
	private static final Pattern GENEID_PREFIXED_RANGE_PATTERN = Pattern.compile("^(.+[^0-9])([0-9]+)\\-(.+[^0-9])([0-9]+)$");

	/**
	 * Finds candidate identifier mentions in a text by pattern matching.
	 * <p>
	 * A candidate is a whitespace-free token that mixes letters and digits and is delimited
	 * by non-alphanumeric characters (or the text boundaries). Candidates containing any of
	 * ' ; [ ] + * or a backslash are skipped. A candidate longer than six characters that
	 * denotes a numeric range with a span of 1 to 20 is additionally expanded into one entry
	 * per member of the range; expanded entries shorter than five characters are dropped.
	 * <p>
	 * Each candidate is reported exactly once.
	 *
	 * @param Doc the text to scan
	 * @return entries of the form {@code start<TAB>last<TAB>text<TAB>GeneID}, where
	 *         {@code start} (inclusive) and {@code last} (exclusive) are offsets of the
	 *         matched token in {@code Doc}; range expansions precede the entry of the
	 *         token they were derived from and share its offsets
	 */
	public ArrayList<String> SearchGeneIDLocation(String Doc)
	{
		ArrayList<String> location = new ArrayList<String>();

		// One space of padding on each side lets a token at the very start or end be delimited.
		String Doc_tmp=" "+Doc+" ";
		Matcher mtmp = GENEID_CANDIDATE_PATTERN.matcher(Doc_tmp);
		while(mtmp.find())
		{
			String str1=mtmp.group(1);
			String str2=mtmp.group(2);
			String str3=mtmp.group(3);

			// Offsets refer to Doc, hence the -1 that compensates for the leading padding space.
			int start = str1.length()-1;
			int last = start+str2.length();
			String mention = Doc.substring(start, last);
			if(!GENEID_FORBIDDEN_CHARS_PATTERN.matcher(mention).matches())
			{
				if(last-start>6 && (GENEID_PAREN_PAIR_PATTERN.matcher(mention).matches() || GENEID_NO_PAREN_PATTERN.matcher(mention).matches()) )
				{
					Matcher numericRange = GENEID_NUMERIC_RANGE_PATTERN.matcher(mention);
					Matcher prefixedRange = GENEID_PREFIXED_RANGE_PATTERN.matcher(mention);
					if(numericRange.find())
					{
						addExpandedGeneIDRange(location, start, last, numericRange.group(1), numericRange.group(2), numericRange.group(3));
					}
					else if(prefixedRange.find())
					{
						// Only expand when both ends of the range carry the same stem.
						if(prefixedRange.group(1).equals(prefixedRange.group(3)))
						{
							addExpandedGeneIDRange(location, start, last, prefixedRange.group(1), prefixedRange.group(2), prefixedRange.group(4));
						}
					}
				}
				location.add(start+"\t"+last+"\t"+mention+"\tGeneID");
			}

			// Mask the processed token with '@' (same length, so offsets stay valid) and rescan
			// for the next candidate.
			StringBuilder masked = new StringBuilder(Doc_tmp.length());
			masked.append(str1);
			for(int m=0;m<str2.length();m++)
			{
				masked.append('@');
			}
			masked.append(str3);
			Doc_tmp=masked.toString();
			mtmp = GENEID_CANDIDATE_PATTERN.matcher(Doc_tmp);
		}
		return location;
	}

	/**
	 * Appends one location entry per member of the numeric range firstDigits..lastDigits.
	 * Nothing is added when either number has more than six digits or when the span is not
	 * between 1 and 20. Each number is left-padded with zeros to the width of
	 * {@code firstDigits}, so "0009".."0011" yields 0009, 0010, 0011.
	 */
	private static void addExpandedGeneIDRange(ArrayList<String> location, int start, int last, String stem, String firstDigits, String lastDigits)
	{
		// The six-digit cap also guarantees Integer.parseInt cannot overflow.
		if(firstDigits.length()>6 || lastDigits.length()>6)
		{
			return;
		}
		int Num1 = Integer.parseInt(firstDigits);
		int Num2 = Integer.parseInt(lastDigits);
		if(Num2-Num1<=0 || Num2-Num1>20)
		{
			return;
		}
		int width = firstDigits.length();
		for(int n=Num1;n<=Num2;n++)
		{
			String digits = Integer.toString(n);
			StringBuilder expanded = new StringBuilder(stem);
			for(int pad=digits.length();pad<width;pad++)
			{
				expanded.append('0');
			}
			expanded.append(digits);
			if(expanded.length()>=5)
			{
				location.add(start+"\t"+last+"\t"+expanded+"\tGeneID");
			}
		}
	}
	/**
	 * Adds annotations for identifier mentions found by pattern matching, then writes the result.
	 * <p>
	 * For every passage of every document held in {@code GNormPlus.BioCDocobj}, the passage
	 * text is scanned with {@link #SearchGeneIDLocation(String)}. Each candidate's text is
	 * lower-cased and stripped of non-word characters, hyphens and underscores; when the
	 * result is a key of {@code GNormPlus.GeneIDs_hash}, the candidate entry followed by
	 * {@code <TAB>GeneID:<mapped value>} is appended to that passage's annotation list.
	 * Finally {@code BioCOutput} is invoked with the updated annotations.
	 *
	 * @param Filename     forwarded unchanged to {@code BioCOutput} (presumably the input file; confirm against BioCOutput)
	 * @param FilenameBioC forwarded unchanged to {@code BioCOutput} (presumably the output file; confirm against BioCOutput)
	 * @throws IOException        declared by this method; presumably raised by {@code BioCOutput}
	 * @throws XMLStreamException declared by this method; presumably raised by {@code BioCOutput}
	 */
	public void GeneIDRecognition(String Filename,String FilenameBioC) throws IOException, XMLStreamException
	{
		for (int docIdx = 0; docIdx < GNormPlus.BioCDocobj.PMIDs.size(); docIdx++) // documents
		{
			for (int passageIdx = 0; passageIdx < GNormPlus.BioCDocobj.PassageNames.get(docIdx).size(); passageIdx++) // passages
			{
				String passageText = GNormPlus.BioCDocobj.PassageContexts.get(docIdx).get(passageIdx);

				// Candidate mentions located by pattern match.
				ArrayList<String> candidates = SearchGeneIDLocation(passageText);
				for (String candidate : candidates)
				{
					String[] fields = candidate.split("\t");
					// Normalised lookup key: lower case, punctuation and separators removed.
					String lookupKey = fields[2].toLowerCase().replaceAll("[\\W\\-\\_]+", "");
					if (!GNormPlus.GeneIDs_hash.containsKey(lookupKey))
					{
						continue;
					}
					GNormPlus.BioCDocobj.Annotations.get(docIdx).get(passageIdx).add(candidate+"\tGeneID:"+GNormPlus.GeneIDs_hash.get(lookupKey));
				}
			}
		}
		GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,true,true);
	}
}