/*
 * Reviewer documentation for class GN (package GNormPluslib).
 *
 * NOTE(review): this copy of the file looks damaged and will not compile as it stands.
 *  - Text between a less-than sign and the next greater-than sign appears to have been
 *    dropped, e.g. "for(int i=0;i0){score = ...", "HashMap> GeneMention_hash = new HashMap>()"
 *    and "for(int g=0;gmax_score)". Generic type arguments and whole loop bodies are missing.
 *  - Line breaks were collapsed, so every end-of-line comment now hides the statements
 *    that originally followed it on later lines.
 *  - Several method headers are missing (chromosome-location pass, main normalization
 *    pass, final GeneID pass). Restore the file from a clean copy before changing logic.
 *
 * What the surviving text shows:
 *  - MatchedTokens_hash: public static map, not referenced in the surviving text.
 *  - ScoringFunction(geneid, Mention_hash, LF): lower-cases LF, inserts a space at each
 *    digit/letter boundary, replaces non-word characters with spaces and splits on spaces.
 *    A geneid containing "digits-digits" is looked up as "Homo:" plus the second number,
 *    any other geneid as "Gene:" plus geneid, in GNormPlus.GeneScoring_hash. Returns 0.0
 *    when the key is absent. The visible tail adds LF_ParticalMatch to the score before
 *    returning it; the scoring loop itself is lost.
 *    NOTE(review): mtmp2 is created from ptmp, not ptmp2 - looks like a typo; confirm
 *    against a clean copy (mtmp2 is not used in the surviving text).
 *  - PreProcessing4GN(Filename, FilenameBioC): walks GNormPlus.BioCDocobj.Annotations with
 *    three nested indexes, splits each entry on tab into start, last, mentions, type and an
 *    optional fifth field, and for type "Gene" splits mentions on "|". The rest is lost.
 *  - Fragment using GNormPlus.PT_GeneChromosome.SearchMentionLocation(PassageContext,
 *    "ChromosomeLocation"): splits the fourth field of each hit on "|" or ",".
 *  - Normalization fragment (header lost): records Species/Genus/Strain/CellLine/Cell
 *    mentions in Species_hash; concatenates lower-cased passage text into tiabs; skips
 *    mentions matched by Filtering_hash, by Filtering_WithLongForm_hash when the long form
 *    occurs in tiabs, or consisting of a single lower-case letter; groups the remainder in
 *    GeneMention_hash keyed by mentions + tab + taxids; copies "ID" between long-form and
 *    abbreviation entries via PmidLF2Abb_hash and PmidAbb2LF_hash; for IDs with several
 *    comma-separated candidates keeps one (OutputStyle is fixed to "Top1"); writes IDs back
 *    into the annotations; adds annotations for further occurrences of mentions of length
 *    3 or more; resolves FamilyName/DomainMotif through PT_FamilyName.MentionMatch; strips
 *    the "*" and "(anti)" markers from species ids; removes Gene annotations whose mention
 *    is in Species_hash, whose id is empty, or that overlap another annotation (the exact
 *    overlap test is partly lost); calls BioCOutput only when GeneIDMatch is false.
 *  - SearchGeneIDLocation(Doc): pads Doc with spaces and matches tokens that mix letters
 *    and digits. Mentions containing a quote, semicolon, square bracket, plus, asterisk or
 *    backslash are skipped. For mentions longer than 6 characters ending in a numeric range
 *    (prefix + N1 "-" N2, or prefix + N1 "-" same prefix + N2) with 0 < N2-N1 <= 20 and each
 *    number at most 6 digits, one entry per number is added when the expanded string has at
 *    least 5 characters. The mention itself is then added as start, last, mention, "GeneID"
 *    joined by tabs.
 *  - Final fragment (header lost): lower-cases each hit from SearchGeneIDLocation, removes
 *    non-word characters, and when the result is a key of GNormPlus.GeneIDs_hash appends the
 *    hit plus a tab and "GeneID:" + mapped value to the passage annotations, then calls
 *    BioCOutput.
 */
/** * Project: GNormPlus * Function: Gene Normalization */ package GNormPluslib; import bioc.BioCAnnotation; import bioc.BioCCollection; import bioc.BioCDocument; import bioc.BioCLocation; import bioc.BioCPassage; import bioc.io.BioCDocumentWriter; import bioc.io.BioCFactory; import bioc.io.woodstox.ConnectorWoodstox; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.text.BreakIterator; import java.time.LocalDate; import java.time.ZoneId; import java.text.DecimalFormat; import java.math.RoundingMode; import javax.xml.stream.XMLStreamException; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Locale; public class GN { public static HashMap MatchedTokens_hash = new HashMap(); private double ScoringFunction(String geneid,HashMap Mention_hash,String LF) { /* * define gene/homo id */ //LF LF = LF.toLowerCase(); LF = LF.replaceAll("([0-9])([a-z])", "$1 $2"); LF = LF.replaceAll("([a-z])([0-9])", "$1 $2"); LF = LF.replaceAll("([\\W\\-\\_])", " "); LF = LF.replaceAll("[ ]+", " "); String LF_tkn[]=LF.split(" "); int LF_ParticalMatch = 0; Pattern ptmp = Pattern.compile("[0-9]+\\-([0-9]+)"); Matcher mtmp = ptmp.matcher(geneid); Pattern ptmp2 = Pattern.compile("([0-9]+)"); Matcher mtmp2 = ptmp.matcher(geneid); if(mtmp.find()) { geneid = "Homo:"+mtmp.group(1); } else { geneid = "Gene:"+geneid; } if(GNormPlus.GeneScoring_hash.containsKey(geneid)) { HashMap TF = new HashMap(); // token i in gene j HashMap TermFrequency = new HashMap(); /* * Tokens in Query (Gene id lexicon) */ String l[]=GNormPlus.GeneScoring_hash.get(geneid).split("\t"); // Gene:2664293 cmk-1,cytidylate-1,kinase-1,mssa-1 0.4096 4 0.0625 1 2.0 String 
tkns_Gene[] = l[0].split(","); for(int i=0;i0){score = score + LF_ParticalMatch;/*System.out.println(geneid+"\t"+LF+"\t"+score);*/} return score; } else { //System.out.println("Error: cannot find geneid: "+geneid+" in GeneScoring_hash"); return 0.0; } } public void PreProcessing4GN(String Filename,String FilenameBioC) throws IOException, XMLStreamException { for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++) { for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) { for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) { String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t"); String start=anno[0]; String last=anno[1]; String mentions=anno[2]; String type=anno[3]; String id=""; if(anno.length>=5) { id=anno[4]; } if(type.equals("Gene")) { String mentionArr[] = mentions.split("\\|"); boolean update=false; for(int m=0;m locations = GNormPlus.PT_GeneChromosome.SearchMentionLocation(PassageContext,"ChromosomeLocation"); for (int k = 0 ; k < locations.size() ; k++) { String anno[]=locations.get(k).split("\t"); //int start= Integer.parseInt(anno[0]); //int last= Integer.parseInt(anno[1]); //String mention = anno[2]; String ids = anno[3]; //GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tChromosomeLocation\t"+ids); //paragraph String IDs[] = ids.split("[\\|,]"); for(int idcount=0;idcount Species_hash = new HashMap(); for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) /** Paragraphs : j */ { for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) /** Annotation : k */ { String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t"); String mentions=anno[2]; String type=anno[3]; if(type.matches("(Species|Genus|Strain|CellLine|Cell)")) { Species_hash.put(mentions,""); } } } /* * Collect Gene mentions : * * GeneMention-taxid -> "ID" : geneid * -> "type" : "Gene" * -> start1-last1 : "" * -> 
start2-last2 : "" * -> start3-last3 : "" */ String tiabs=""; for (int j = 0; j < GNormPlus.BioCDocobj.PassageContexts.get(i).size(); j++) /** Paragraphs : j */ { tiabs=tiabs+GNormPlus.BioCDocobj.PassageContexts.get(i).get(j).toLowerCase(); } HashMap> GeneMention_hash = new HashMap>(); HashMap Mention_hash = new HashMap(); for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) /** Paragraphs : j */ { for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) /** Annotation : k */ { String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t"); String start=anno[0]; String last=anno[1]; String mentions=anno[2]; String type=anno[3]; String taxids="Tax:9606"; if(anno.length>=5) { taxids=anno[4]; } String mentions_tmp=mentions.toLowerCase(); mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]",""); mentions_tmp=mentions_tmp.replaceAll("[0-9]","0"); taxids=taxids.replaceAll("(Focus|Right|Left|Prefix|Tax):",""); if(taxids.equals("")) { taxids="9606"; } /** Filtering */ boolean found_filter = false; if(GNormPlus.Filtering_hash.containsKey(mentions_tmp)) // filtering { found_filter=true; } if(found_filter==false) //abbreviation { for(String f : GNormPlus.Filtering_WithLongForm_hash.keySet()) { if( GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*[\\t\\|]"+f+"\tGene.*") || GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*\\t"+f+"\\|[^\t]+\tGene.*") ) { String lf=GNormPlus.Filtering_WithLongForm_hash.get(f); if(tiabs.matches(".*"+lf+".*")) { found_filter=true; break; } } } } if(found_filter==false) { if( GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*[\\t\\|][a-z]\tGene.*") || GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*\\t[a-z]\\|[^\t]+\tGene.*") //32171191 Wuhan's ) { found_filter=true; } } if(found_filter == false) { if(type.matches("Gene")) { if(GeneMention_hash.containsKey(mentions+"\t"+taxids)) { 
GeneMention_hash.get(mentions+"\t"+taxids).put(start+"\t"+last,""); } else { HashMap offset_hash = new HashMap(); offset_hash.put(start+"\t"+last,""); GeneMention_hash.put(mentions+"\t"+taxids, offset_hash); GeneMention_hash.get(mentions+"\t"+taxids).put("type", type); Mention_hash.put(mentions,"Gene"); } } else if(type.matches("(FamilyName|DomainMotif)")) { String GMs[]=mentions.split("\\|"); for(int g=0;g GuaranteedGene2ID = new HashMap(); HashMap MultiGene2ID = new HashMap(); for(String GeneMentionTax : GeneMention_hash.keySet()) { String GT[]=GeneMentionTax.split("\\t"); String mentions=GT[0]; String taxids=GT[1]; String GMs[]=mentions.split("\\|"); HashMap taxids_hash = new HashMap(); String taxids_arr[]=taxids.split(","); for(int t=0;t1) //{ // System.out.println(Pmid+"\t"+mention+"\t"+mentions+"\t"+IDstr); //} for(int c=0;c Abbreviation */ for(String GeneMentionTax : GeneMention_hash.keySet()) { String MT[] = GeneMentionTax.split("\\t"); if(GNormPlus.PmidLF2Abb_hash.containsKey(Pmid+"\t"+MT[0])) { String GeneMentionTax_Abb = GNormPlus.PmidLF2Abb_hash.get(Pmid+"\t"+MT[0]) + "\t" + MT[1]; if(GeneMention_hash.containsKey(GeneMentionTax_Abb) && GeneMention_hash.get(GeneMentionTax).containsKey("ID")) { GeneMention_hash.get(GeneMentionTax_Abb).put("ID", GeneMention_hash.get(GeneMentionTax).get("ID")); } } } /* * Gene id refinement: * 5. 
Ranking by scoring function (inference network) */ for(String GeneMentionTax : GeneMention_hash.keySet()) { if(GeneMention_hash.get(GeneMentionTax).containsKey("ID") && GeneMention_hash.get(GeneMentionTax).get("ID").matches(".+,.+")) { String geneids=GeneMention_hash.get(GeneMentionTax).get("ID"); String geneid[] = geneids.split(","); String OutputStyle="Top1"; if(OutputStyle.equals("Top1")) { //only return the best one double max_score=0.0; String target_geneid=""; for(int g=0;gmax_score) { max_score=score; target_geneid=geneid[g]; } else if(score == 0.0) { //System.out.println(GeneMentionTax); } } GeneMention_hash.get(GeneMentionTax).put("ID", target_geneid); } else // "All" { //return all geneids String geneSTR=""; for(int g=0;g FullName * */ for(String GeneMentionTax : GeneMention_hash.keySet()) { String MT[] = GeneMentionTax.split("\\t"); if(GNormPlus.PmidAbb2LF_hash.containsKey(Pmid+"\t"+MT[0])) { String GeneMentionTax_LF = GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+MT[0]) + "\t" + MT[1]; if(GeneMention_hash.containsKey(GeneMentionTax_LF) && GeneMention_hash.get(GeneMentionTax).containsKey("ID")) { GeneMention_hash.get(GeneMentionTax_LF).put("ID", GeneMention_hash.get(GeneMentionTax).get("ID")); } } } /* * Gene id refinement: * 7. The inference network tokens of Abbreviation.ID should contain at least LF tokens * 8. 
The short mention should be filtered if not long form support */ ArrayList removeGMT = new ArrayList(); for(String GeneMentionTax : GeneMention_hash.keySet()) { String GT[]=GeneMentionTax.split("\\t"); String mentions=GT[0]; String tax=GT[1]; if(GeneMention_hash.get(GeneMentionTax).containsKey("type") && GeneMention_hash.get(GeneMentionTax).get("type").equals("Gene") && GeneMention_hash.get(GeneMentionTax).containsKey("ID")) { String type = GeneMention_hash.get(GeneMentionTax).get("type"); String id = GeneMention_hash.get(GeneMentionTax).get("ID"); String geneid=""; Pattern ptmp1 = Pattern.compile("^([0-9]+)\\-([0-9]+)$"); Pattern ptmp2 = Pattern.compile("^([0-9]+)$"); Matcher mtmp1 = ptmp1.matcher(id); Matcher mtmp2 = ptmp2.matcher(id); //System.out.println(id); if(mtmp1.find()) { geneid = "Homo:"+mtmp1.group(2); } else if(mtmp2.find()) { geneid = "Gene:"+mtmp2.group(1); } boolean LongFormTknMatch= false; boolean LongFormExist= true; if(GNormPlus.GeneScoring_hash.containsKey(geneid)) { if(GNormPlus.PmidAbb2LF_lc_hash.containsKey(Pmid+"\t"+mentions.toLowerCase())) { /* * token in lexicon : tkn_lexicon * token in mention : tkn_mention */ String l[]=GNormPlus.GeneScoring_hash.get(geneid).split("\t"); // Gene:2664293 cmk-1,cytidylate-1,kinase-1,mssa-1 0.4096 4 0.0625 1 2.0 String tkns_Gene[] = l[0].split(","); ArrayList tkn_lexicon = new ArrayList(); for(int ti=0;ti=5) { taxid_org=anno[4]; } String taxids=taxid_org.replaceAll("(Focus|Right|Left|Prefix|Tax):",""); String GMs[]=mentions.split("\\|"); if(GeneMention_hash.containsKey(mentions+"\t"+taxids) && GeneMention_hash.get(mentions+"\t"+taxids).containsKey("TargetTax")) { String taxtype=taxid_org.replaceAll(":([0-9,]+)",""); String taxid=GeneMention_hash.get(mentions+"\t"+taxids).get("TargetTax"); GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mentions+"\t"+type+"\t"+taxtype+":"+taxid); } if(type.equals("Gene")) { GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, 
GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k) + "|"); if(GeneMention_hash.containsKey(mentions+"\t"+taxids) && GeneMention_hash.get(mentions+"\t"+taxids).containsKey("ID")) { GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k) + GeneMention_hash.get(mentions+"\t"+taxids).get("ID") + "," ); } else // cannot find appropriate species { //System.out.println(mention+"\t"+taxid); } GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).substring(0, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).length()-1)); // remove ",$" } } } //Extend to all gene mentions HashMap GeneMentions = new HashMap(); // Extending Gene mentions HashMap GeneMentionLocation = new HashMap(); // Extending Gene mentions for(int j=0;j=5) { id=anno[4]; } if(type.equals("Gene") && id.matches("(Focus|Right|Left|Prefix|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)")) { GeneMentions.put(mentions.toLowerCase(), id); for (int s=start ;s<=last;s++) { GeneMentionLocation.put(j+"\t"+s,""); } } else if(type.equals("Gene") && id.matches("(Focus|Right|Left|Prefix|Tax)\\:([0-9]+)\\|([0-9]+)")) { GeneMentions.put(mentions.toLowerCase(), id); for (int s=start ;s<=last;s++) { GeneMentionLocation.put(j+"\t"+s,""); } } } } for(int j=0;ji && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j) { String PassageContexts = " " + GNormPlus.BioCDocobj.PassageContexts.get(i).get(j) + " "; String PassageContexts_tmp = PassageContexts.toLowerCase(); for(String gm : GeneMentions.keySet()) { String id = GeneMentions.get(gm); if(gm.length()>=3) { gm = gm.replaceAll("[ ]*[\\|]*$", ""); gm = gm.replaceAll("^[\\|]*[ ]*", ""); gm = gm.replaceAll("[\\|][\\|]+", "\\|"); if(!gm.matches("[\\W\\-\\_]*")) { gm = gm.replaceAll("([^A-Za-z0-9\\| ])", "\\\\$1"); Pattern ptmp = Pattern.compile("^(.*[\\W\\-\\_])("+gm+")([\\W\\-\\_].*)$"); Matcher mtmp = ptmp.matcher(PassageContexts_tmp); while(mtmp.find()) { String pre = 
mtmp.group(1); String gmtmp = mtmp.group(2); String post = mtmp.group(3); int start = pre.length()-1; int last = start+gmtmp.length(); if(PassageContexts.length()>=last+1) { String mention = PassageContexts.substring(start+1,last+1); if(!GeneMentionLocation.containsKey(j+"\t"+start) && !GeneMentionLocation.containsKey(j+"\t"+last)) { GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tGene\t"+id); } } gmtmp = gmtmp.replaceAll(".", "\\@"); PassageContexts_tmp=pre+""+gmtmp+""+post; mtmp = ptmp.matcher(PassageContexts_tmp); } } } } } } //Apply to FamilyNames HashMap geneids = new HashMap(); // Extending Gene mentions for(int j=0;j=5) { id=anno[4]; } Pattern ptmp0 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)$"); Matcher mtmp0 = ptmp0.matcher(id); Pattern ptmp1 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)$"); Matcher mtmp1 = ptmp1.matcher(id); if(mtmp0.find()) { geneids.put(mtmp0.group(3), ""); } if(mtmp1.find()) { geneids.put(mtmp1.group(3), ""); } } } } for(int j=0;j=0 ; k--) // Annotation : k { String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t"); String mention=anno[2]; String type=anno[3]; if(type.matches("(FamilyName|DomainMotif)")) { String id="Tax:9606"; if(anno.length>=5) { id=anno[4]; } String IDstrs = GNormPlus.PT_FamilyName.MentionMatch(mention); String IDstr[]=IDstrs.split("\\|"); String ids=""; for(int id_i=0;id_i=5) { Annotation_k=anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+type+"\t"+anno[4]; } GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k,Annotation_k+"|"+ids); } else { GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k); } } } } //Species "*" and "(anti)" removed. 
for(int j=0;j=0 ; k--) // Annotation : k { String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t"); String type=anno[3]; if(type.equals("Species") || type.equals("Genus") || type.equals("Strain") || type.equals("CellLine") || type.equals("Cell")) { String id=anno[4]; id=id.replaceAll("\\*", ""); id=id.replaceAll("\\(anti\\)", ""); String Annotation_k=anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+type+"\t"+id; GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k,Annotation_k); } } } for(int j=0;j=0 ; k--) // Annotation : k { String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t"); int start = Integer.parseInt(anno[0]); int last = Integer.parseInt(anno[1]); String mention = anno[2]; String type = anno[3]; String id = anno[4]; if(type.equals("Gene") && Species_hash.containsKey(mention)) { GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k); } else if(type.equals("Gene") && id.equals("")) { GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k); } else { for (int k1 = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k1 >=0 ; k1--) // Annotation : k { if(k1 != k) { String anno1[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k1).split("\t"); int start1 = Integer.parseInt(anno1[0]); int last1 = Integer.parseInt(anno1[1]); if((start1=last) || (start1<=start && last1>last)) { GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k); break; } } } } } } } if(GeneIDMatch == true) { //GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true); } else { GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,true,true); } } /* * Search Potential GeneID in the Prefix Tree */ public ArrayList SearchGeneIDLocation(String Doc) { ArrayList location = new ArrayList(); String Doc_tmp=" "+Doc+" "; Pattern ptmp = 
Pattern.compile("^(.*[^A-Za-z0-9]+)([0-9]+\\S*[A-Za-z]+|[A-Za-z]+\\S*[0-9]+|[0-9]+\\S*[A-Za-z]+\\S*[0-9]+|[A-Za-z]+\\S*[0-9]+\\S*[A-Za-z]+)([^A-Za-z0-9]+.*)$"); Matcher mtmp = ptmp.matcher(Doc_tmp); while(mtmp.find()) { String str1=mtmp.group(1); String str2=mtmp.group(2); String str3=mtmp.group(3); for(int m=str1.length();m<=(str1.length()+str2.length());m++) { int start = str1.length()-1; int last = start+str2.length(); String mention = Doc.substring(start, last); if(!mention.matches(".*[\\'\\;\\[\\]\\+\\*\\\\].*")) { if(last-start>6 && (mention.matches(".*\\(.*\\).*") || mention.matches("[^\\(\\)]+")) ) { Pattern ptmp1 = Pattern.compile("^(.+[^0-9])([0-9]+)\\-([0-9]+)$"); Matcher mtmp1 = ptmp1.matcher(mention); Pattern ptmp2 = Pattern.compile("^(.+[^0-9])([0-9]+)\\-(.+[^0-9])([0-9]+)$"); Matcher mtmp2 = ptmp2.matcher(mention); if(mtmp1.find()) { String S1 = mtmp1.group(1); if(mtmp1.group(2).length()<=6 && mtmp1.group(3).length()<=6) { int Num1 = Integer.parseInt(mtmp1.group(2)); int Num2 = Integer.parseInt(mtmp1.group(3)); String prefix = ""; Pattern ptmp3 = Pattern.compile("^([0]+)"); Matcher mtmp3 = ptmp3.matcher(mtmp1.group(2)); if(mtmp3.find()) { prefix = mtmp3.group(1); } if(Num2-Num1>0 && (Num2-Num1<=20)) { for(int n=Num1;n<=Num2;n++) { String StrNum=S1+prefix+n; if(StrNum.length()>=5) { location.add(start+"\t"+last+"\t"+StrNum+"\tGeneID"); } } } } } else if(mtmp2.find()) { if(mtmp2.group(2).length()<=6 && mtmp2.group(4).length()<=6) { String S1 = mtmp2.group(1); int Num1 = Integer.parseInt(mtmp2.group(2)); String S2 = mtmp2.group(3); int Num2 = Integer.parseInt(mtmp2.group(4)); if(S1.equals(S2)) { String prefix = ""; Pattern ptmp3 = Pattern.compile("^([0]+)"); Matcher mtmp3 = ptmp3.matcher(mtmp2.group(2)); if(mtmp3.find()) { prefix = mtmp3.group(1); } if(Num2-Num1>0 && (Num2-Num1<=20)) { for(int n=Num1;n<=Num2;n++) { String StrNum=S1+prefix+n; if(StrNum.length()>=5) { location.add(start+"\t"+last+"\t"+StrNum+"\tGeneID"); } } } } } } } 
location.add(start+"\t"+last+"\t"+mention+"\tGeneID"); } } String men=""; for(int m=0;m locations = SearchGeneIDLocation(PassageContext); for (int k = 0 ; k < locations.size() ; k++) { String anno[]=locations.get(k).split("\t"); String mention = anno[2].toLowerCase(); mention = mention.replaceAll("[\\W\\-\\_]+", ""); if(GNormPlus.GeneIDs_hash.containsKey(mention)) { GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(locations.get(k)+"\tGeneID:"+GNormPlus.GeneIDs_hash.get(mention)); //paragraph } } } } GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,true,true); } }