/** * Project: GNormPlus * Function: Species recognition and Species assignment */ package GNormPluslib; import bioc.BioCAnnotation; import bioc.BioCCollection; import bioc.BioCDocument; import bioc.BioCLocation; import bioc.BioCPassage; import bioc.io.BioCDocumentWriter; import bioc.io.BioCFactory; import bioc.io.woodstox.ConnectorWoodstox; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.text.BreakIterator; import java.time.LocalDate; import java.time.ZoneId; import javax.xml.stream.XMLStreamException; import org.tartarus.snowball.SnowballStemmer; import org.tartarus.snowball.ext.englishStemmer; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Collections; public class SR { @SuppressWarnings("null") public void SpeciesRecognition(String Filename,String FilenameBioC,String StrainFilename,String FilterAntibody) throws IOException, XMLStreamException { /** Recognizing Species Names: SP */ for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) /** PMIDs : i */ { String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i); PrefixTree PT_Genus = new PrefixTree(); HashMap SPID_hash = new HashMap(); ArrayList TargetedLocation = new ArrayList(); HashMap GenusNames = new HashMap(); HashMap Mention2ID_lc = new HashMap(); ArrayList IDset = new ArrayList(); for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */ { String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context /** Species recognition */ ArrayList locations = GNormPlus.PT_Species.SearchMentionLocation(PassageContext,"Species"); /** PT_Species */ for (int k = 0 ; k < locations.size() ; k++) { String anno[]=locations.get(k).split("\t"); int start= Integer.parseInt(anno[0]); int last= Integer.parseInt(anno[1]); // For anti-serum filtering String ForwardSTR=""; String BackwardSTR=""; if(start>21) { ForwardSTR = (PassageContext+"ZZZZZZZZZZZZZZZZZZZZZZZZZZZ").substring(start-21,last); } else { ForwardSTR = (PassageContext+"ZZZZZZZZZZZZZZZZZZZZZZZZZZZ").substring(0,last); } if(PassageContext.length()>last+21) { BackwardSTR = PassageContext.substring(start,last+21); } else { BackwardSTR = PassageContext.substring(start,PassageContext.length()); } String mention = anno[2]; String id = anno[3]; String mention_tmp=mention.toLowerCase(); mention_tmp = mention_tmp.replaceAll("([^A-Za-z0-9@ ])", "\\\\$1"); String antibody=""; if(ForwardSTR.toLowerCase().matches(".*(anti|antibody|antibodies|serum|polyclonal|monoclonal|igg)[\\W\\-\\_]+"+mention_tmp)) {antibody="(anti)";}//filtering : antibody else if(BackwardSTR.toLowerCase().matches(mention_tmp+"[\\W\\-\\_]+(anti|antibody|antibodies|serum|polyclonal|monoclonal|igg).*")){antibody="(anti)";} //filtering : antibody else if(BackwardSTR.toLowerCase().matches(mention_tmp+"[\\W\\-\\_]+[A-Za-z0-9]+[\\W\\-\\_]+(anti|antibody|antibodies|serum|polyclonal|monoclonal|igg).*")){antibody="(anti)";} //filtering : antibody if(mention.matches(".*[\\(\\[\\{].*") && BackwardSTR.toLowerCase().matches(mention_tmp+"\\).*") ) { last=last+1; mention=mention+")"; } if(BackwardSTR.toLowerCase().matches(mention_tmp+"[0-9].*")){} // filtered: Bee1p else if((mention.matches(".*[;:,].*")) && mention.length()<=10){} // filtered : x, XXX else if(mention.matches("to[\\W\\-\\_]+[0-9]+")){} // to 7 else if(mention.matches("[a-z][\\)\\]\\}].*") && (!mention.matches(".*[\\(\\[\\{].*")) && mention.length()<=10){} // s). Major else if(mention.matches(".*[\\(\\[\\{].*") && (!mention.matches(".*[\\)\\]\\}].*")) && mention.length()<=10){} // s). Major else if(!id.equals("NA")) { if(GNormPlus.BioCDocobj.Annotations.size()>i && GNormPlus.BioCDocobj.Annotations.get(i).size()>j) { if((!mention.matches("^[A-Za-z] [A-Za-z0-9]+$")) && (mention.length()>=3)) // invalid species: "a group/a GAL4/a strain" { if(FilterAntibody.equals("False") || (!antibody.equals("(anti)"))) { String patt="^(.+?) [sS]train"; Pattern ptmp = Pattern.compile(patt); Matcher mtmp = ptmp.matcher(mention); if(mtmp.find()) { mention=mtmp.group(1); last=last-7; } GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tSpecies\t"+id); //+antibody String mentions_tmp=mention.toLowerCase(); mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]",""); mentions_tmp=mentions_tmp.replaceAll("[0-9]","0"); GNormPlus.Filtering_hash.put(mentions_tmp,""); Mention2ID_lc.put(mention.toLowerCase(), id); //+antibody String mention_genus = ""; patt="^([A-Za-z]+) "; ptmp = Pattern.compile(patt); mtmp = ptmp.matcher(mention); if(mtmp.find()) { mention_genus=mtmp.group(1); // get genus } IDset.add(id); for(int s=start;si && GNormPlus.BioCDocobj.Annotations.get(i).size()>j) { if(!TargetedLocation.contains(j+"\t"+start)) //already exists { int last40=0; if(PassageContext.length()>=last+40) { last40=last+40; } else { last40=PassageContext.length(); } // For anti-serum filtering String ForwardSTR=""; String BackwardSTR=""; if(start>21) { ForwardSTR = PassageContext.substring(start-21,last); } else { ForwardSTR = PassageContext.substring(0,last); } if(PassageContext.length()>last+21) { BackwardSTR = PassageContext.substring(start,last+21); } else { BackwardSTR = PassageContext.substring(start,PassageContext.length()); } String mention_tmp=mention.toLowerCase(); mention_tmp = mention_tmp.replaceAll("([^A-Za-z0-9@ ])", "\\\\$1"); if(mention_tmp.matches(".*[\\[\\]\\(\\)\\{\\}].*")){} else if(BackwardSTR.toLowerCase().matches(mention_tmp+"[0-9\\-\\_].*")){} // filtered: Bee1p else if(ForwardSTR.toLowerCase().matches(".*[0-9\\-\\_]"+mention_tmp)){} // filtered: IL-22RA1 else { String patt="[\\W\\-]cell([\\- ]*line|)[s]*[\\W\\-]"; Pattern ptmp = Pattern.compile(patt); Matcher mtmp = ptmp.matcher(PassageContext.substring(last, last40).toLowerCase()); if(mtmp.find()) { if(GNormPlus.taxid4gene.contains(id)) // for gene { id="*"+id; } GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tCell\t"+id); String mentions_tmp=mention.toLowerCase(); mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]",""); mentions_tmp=mentions_tmp.replaceAll("[0-9]","0"); GNormPlus.Filtering_hash.put(mentions_tmp,""); IDset.add(id); for(int s=start;s=7) { GenusNames.put(ID,SPID_hash.get(ID)); } } } GenusNames.put("3702", "arabidopsis"); GenusNames.put("4932", "saccharomyces"); GenusNames.put("562", "escherichia"); GenusNames.put("7227", "drosophila"); GenusNames.put("8355", "xenopus"); PT_Genus.Hash2Tree(GenusNames); /** Genus recognition */ for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */ { if(GNormPlus.BioCDocobj.PassageContexts.size()>i && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j && GNormPlus.BioCDocobj.Annotations.size()>i && GNormPlus.BioCDocobj.Annotations.get(i).size()>j ) { String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); ArrayList locations_Genus = PT_Genus.SearchMentionLocation(PassageContext,"Genus"); /** PT_Genus*/ for (int k = 0 ; k < locations_Genus.size() ; k++) { String anno[]=locations_Genus.get(k).split("\t"); String start= anno[0]; String last= anno[1]; String mention = anno[2]; String id = anno[3]; if(!TargetedLocation.contains(j+"\t"+start)) //already exists { String patt="^\\**([0-9]+)$"; Pattern ptmp = Pattern.compile(patt); Matcher mtmp = ptmp.matcher(id); if(mtmp.find()) { id = mtmp.group(1); } if(GNormPlus.taxid4gene.contains(id)) // for gene { id="*"+id; } GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tGenus\t"+id); String mentions_tmp=mention.toLowerCase(); mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]",""); mentions_tmp=mentions_tmp.replaceAll("[0-9]","0"); GNormPlus.Filtering_hash.put(mentions_tmp,""); IDset.add(id); for(int s=Integer.parseInt(start);s StrainID_hash = new HashMap(); BufferedReader br = new BufferedReader(new FileReader(StrainFilename)); String line=""; while ((line = br.readLine()) != null) { String l[]=line.split("\t"); String ancestor = l[0]; String tax_id = l[1]; String tax_names = l[2]; if(SPID_hash.containsKey(ancestor)) { StrainID_hash.put(tax_id, tax_names); // tax id -> strain } else if(SPID_hash.containsKey(tax_id)) { StrainID_hash.put(tax_id, tax_names); // tax id -> strain } } br.close(); HashMap StrainNames = new HashMap(); for(String ID: StrainID_hash.keySet()) { StrainNames.put(ID,StrainID_hash.get(ID)); } PT_Strain.Hash2Tree(StrainNames); /** Strain recognition */ for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */ { if(GNormPlus.BioCDocobj.PassageContexts.size()>i && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j && GNormPlus.BioCDocobj.Annotations.size()>i && GNormPlus.BioCDocobj.Annotations.get(i).size()>j ) { String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context ArrayList locations_Strain = PT_Strain.SearchMentionLocation(PassageContext,"Strain"); /** PT_Strain*/ for (int k = 0 ; k < locations_Strain.size() ; k++) { String anno[]=locations_Strain.get(k).split("\t"); String start= anno[0]; String last= anno[1]; String mention = anno[2]; String id = anno[3]; if(!TargetedLocation.contains(j+"\t"+start)) //already exists { if((!mention.matches(".*[;,\\{\\}\\(\\)\\[\\]].*")) && !mention.matches("[a-z]{1,4} [0-9]{1,3}")) { if(GNormPlus.taxid4gene.contains(id)) // for gene { id="*"+id; } GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tStrain\t"+id); String mentions_tmp=mention.toLowerCase(); mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]",""); mentions_tmp=mentions_tmp.replaceAll("[0-9]","0"); GNormPlus.Filtering_hash.put(mentions_tmp,""); IDset.add(id); for(int s=Integer.parseInt(start);s OtherNames = new HashMap(); for(String men : Mention2ID_lc.keySet()) { String men_id= Mention2ID_lc.get(men); if(GNormPlus.PmidLF2Abb_lc_hash.containsKey(Pmid+"\t"+men)) { String Abb = GNormPlus.PmidLF2Abb_lc_hash.get(Pmid+"\t"+men); // Abbreviation if(OtherNames.containsKey(men_id)) { OtherNames.put(men_id, OtherNames.get(men_id)+"|"+Abb); } else { OtherNames.put(men_id,Abb); } } String men_nospace=men.replaceAll(" ", ""); // no space if(OtherNames.containsKey(men_id)) { OtherNames.put(men_id, OtherNames.get(men_id)+"|"+men_nospace); } else { OtherNames.put(men_id,men_nospace); } } PrefixTree PT_Others = new PrefixTree(); PT_Others.Hash2Tree(OtherNames); /** * * Others: * 1) Abbreviation * 2) no space * * */ for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) /** Paragraphs : j */ { if(GNormPlus.BioCDocobj.PassageContexts.size()>i && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j && GNormPlus.BioCDocobj.Annotations.size()>i && GNormPlus.BioCDocobj.Annotations.get(i).size()>j ) { String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context ArrayList locations_Abb = PT_Others.SearchMentionLocation(PassageContext,"Species"); /** PT_Abb*/ for (int k = 0 ; k < locations_Abb.size() ; k++) { String anno[]=locations_Abb.get(k).split("\t"); String start= anno[0]; String last= anno[1]; String mention = anno[2]; String id = anno[3]; if(!TargetedLocation.contains(j+"\t"+start)) //already exists { if(GNormPlus.taxid4gene.contains(id)) // for gene { id="*"+id; } GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tSpecies\t"+id); String mentions_tmp=mention.toLowerCase(); mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]",""); mentions_tmp=mentions_tmp.replaceAll("[0-9]","0"); GNormPlus.Filtering_hash.put(mentions_tmp,""); Mention2ID_lc.put(mention.toLowerCase(), id); IDset.add(id); for(int s=Integer.parseInt(start);si && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j && GNormPlus.BioCDocobj.Annotations.size()>i && GNormPlus.BioCDocobj.Annotations.get(i).size()>j) { ArrayList remove_anno = new ArrayList (); for (int a = 0; a < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); a++) /** Annotations : a */ { String SpAnno[]=GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(a).split("\t"); String start= SpAnno[0]; String last= SpAnno[1]; String mention = SpAnno[2]; String type = SpAnno[3]; if(type.matches("Gene|FamilyName")) { GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(a,start+"\t"+last+"\t"+mention+"\t"+type); } else if(type.matches("Species|Genus|Strain|Cell") && SpAnno.length==5) { //System.out.println(GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(a)); /** Abbreviation solution */ if(GNormPlus.PmidAbb2LF_lc_hash.containsKey(Pmid+"\t"+mention.toLowerCase()) && Mention2ID_lc.containsKey(GNormPlus.PmidAbb2LF_lc_hash.containsKey(Pmid+"\t"+mention.toLowerCase()))) { String LF_lc=GNormPlus.PmidAbb2LF_lc_hash.get(Pmid+"\t"+mention.toLowerCase()); if(Mention2ID_lc.containsKey(LF_lc)) { String LF_ID=Mention2ID_lc.get(LF_lc); GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(a, start+"\t"+last+"\t"+mention+"\t"+type+"\t"+LF_ID); String mentions_tmp=mention.toLowerCase(); mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]",""); mentions_tmp=mentions_tmp.replaceAll("[0-9]","0"); GNormPlus.Filtering_hash.put(mentions_tmp,""); } } else if (SpAnno.length>4) { String id = SpAnno[4]; String id_split[]=id.split(";"); if(id_split.length>=2) { /** Smallest set of tax ids */ boolean found=false; for(int x=0;x= 0 ; counter--) { int ai=remove_anno.get(counter); //System.out.println("\n"+ai+"\t"+GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(ai)); GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(ai); } } } } GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true); //save in BioC file } public void SpeciesAssignment(String Filename,String FilenameBioC) throws IOException, XMLStreamException { GNormPlus.BioCDocobj.Annotations = new ArrayList(); GNormPlus.BioCDocobj.BioCReaderWithAnnotation(Filename); BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US); for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++) /** PMIDs : i */ { HashMap PrefixIDTarget_hash = new HashMap(); PrefixIDTarget_hash.put("9606", "h"); PrefixIDTarget_hash.put("10090", "m"); PrefixIDTarget_hash.put("10116", "r"); PrefixIDTarget_hash.put("4932", "y"); PrefixIDTarget_hash.put("7227", "d"); PrefixIDTarget_hash.put("7955", "z|zf|Zf|dr|Dr"); PrefixIDTarget_hash.put("3702", "at|At"); HashMap SP2Num_hash = new HashMap(); for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) /** Paragraphs : j */ { for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k { String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t"); if(anno.length==5) //Species { String patt="^\\**([0-9]+)$"; Pattern ptmp = Pattern.compile(patt); Matcher mtmp = ptmp.matcher(anno[4]); if(mtmp.find()) { String id = mtmp.group(1); if(!PrefixIDTarget_hash.containsKey(id)) { PrefixIDTarget_hash.put(id,GNormPlus.PrefixID_hash.get(id)); // taxid -> prefix } if(j == 0)//title { if(SP2Num_hash.containsKey(id)) { SP2Num_hash.put(id, SP2Num_hash.get(id)+2); } else { if(GNormPlus.TaxFreq_hash.containsKey(id)) { SP2Num_hash.put(id, GNormPlus.TaxFreq_hash.get(id)+2); } else { SP2Num_hash.put(id, 2.0); } } // Virus -> Human (not to double weight human to virus) /*if(GNormPlus.SP_Virus2Human_hash.containsKey(id)) { if(SP2Num_hash.containsKey("9606")) { SP2Num_hash.put("9606", SP2Num_hash.get("9606")+2); } else { SP2Num_hash.put("9606", 2 + GNormPlus.TaxFreq_hash.get("9606")+1); } }*/ } else { if(SP2Num_hash.containsKey(id)) { SP2Num_hash.put(id, SP2Num_hash.get(id)+1); } else { if(GNormPlus.TaxFreq_hash.containsKey(id)) { SP2Num_hash.put(id, 1 + GNormPlus.TaxFreq_hash.get(id)); } else { SP2Num_hash.put(id, 1.0); } } // Virus -> Human /*if(GNormPlus.SP_Virus2Human_hash.containsKey(id)) { if(SP2Num_hash.containsKey("9606")) { SP2Num_hash.put("9606", SP2Num_hash.get("9606")+1); } else { SP2Num_hash.put("9606", GNormPlus.TaxFreq_hash.get("9606")+1); } }*/ } } } } } String MajorSP="9606"; double MaxSP=0; for(String tid : SP2Num_hash.keySet()) { if(SP2Num_hash.get(tid)>MaxSP) { MajorSP=tid; MaxSP=SP2Num_hash.get(tid); } } for (int j = 0; j < GNormPlus.BioCDocobj.PassageContexts.get(i).size(); j++) /** Paragraphs : j */ { String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context //int PassageOffset = GNormPlus.BioCDocobj.PassageOffsets.get(i).get(j); // Passage offset iterator.setText(PassageContext); ArrayList Sentence_offsets = new ArrayList(); int Sent_start = iterator.first(); for (int Sent_last = iterator.next(); Sent_last != BreakIterator.DONE; Sent_start = Sent_last, Sent_last = iterator.next()) { Sentence_offsets.add(Sent_start); } HashMap Annotations_Gene_hash = new HashMap(); ArrayList Annotations_Species = new ArrayList(); if(GNormPlus.BioCDocobj.Annotations.get(i).size()>j) { for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k { String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t"); if(anno.length==5) //Species { Annotations_Species.add(GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k)); } else //Gene : if(anno.length==3) { //String mention = PassageContext.substring(Integer.parseInt(anno[0]), Integer.parseInt(anno[1])); Annotations_Gene_hash.put(k,GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k)); // k -> Gene Annotation } } //Gene --> Species Inference (PMID:28777492) HashMap> mention2Location2Species_hash = new HashMap>(); HashMap Location2Species_hash = new HashMap(); for (int k : Annotations_Gene_hash.keySet()) // k is the index of GNormPlus.BioCDocobj.Annotations.get(i).get(j) { boolean SPfound = false; String anno[] = Annotations_Gene_hash.get(k).split("\t"); int G_Start= Integer.parseInt(anno[0]); int G_Last= Integer.parseInt(anno[1]); String G_mentions = anno[2]; /** * 2. Co-occurring word * boundary : * Sentence Start: Sentence_offsets.get(Target_Sentence) * Sentence Last: Sentence_offsets.get(Target_Sentence+1) */ //Find the target sentence int Target_Sentence=0; if(SPfound == false) // 1. left : Closed to start of the gene mention { for(int s=0;s Target_Sentence+1){ Sentence_Last = Sentence_offsets.get(Target_Sentence+1); } if(SPfound == false) // 1. left : Closed to start of the gene mention { int closet_Sp_Start=0; for(int sp=0;sp= Sentence_Start && Sp_Start >closet_Sp_Start) { closet_Sp_Start=Sp_Start; Location2Species_hash.put(Integer.parseInt(anno[0]), taxid); if(mention2Location2Species_hash.containsKey(G_mentions.toLowerCase())) { mention2Location2Species_hash.get(G_mentions.toLowerCase()).put(Integer.parseInt(anno[0]), taxid); } else { mention2Location2Species_hash.put(G_mentions.toLowerCase(),Location2Species_hash); } SPfound=true; } } } } if(SPfound == false) // 2. right : Closed to last of the gene mention { int closet_Sp_Last=1000000; for(int sp=0;sp= G_Last && Sp_Last <= Sentence_Last && Sp_Last < closet_Sp_Last) { closet_Sp_Last=Sp_Last; Location2Species_hash.put(Integer.parseInt(anno[0]), taxid); if(mention2Location2Species_hash.containsKey(G_mentions.toLowerCase())) { mention2Location2Species_hash.get(G_mentions.toLowerCase()).put(Integer.parseInt(anno[0]), taxid); } else { mention2Location2Species_hash.put(G_mentions.toLowerCase(),Location2Species_hash); } SPfound=true; } } } } } for (int k : Annotations_Gene_hash.keySet()) // k is the index of GNormPlus.BioCDocobj.Annotations.get(i).get(j) { String anno[] = Annotations_Gene_hash.get(k).split("\t"); int G_Start= Integer.parseInt(anno[0]); int G_Last= Integer.parseInt(anno[1]); String G_mentions = anno[2]; String G_type = anno[3]; String G_mention_list[]=G_mentions.split("\\|"); String G_mention=G_mention_list[0]; // only use the first term to detect species ; should be updated after SimConcept /** 1. prefix */ boolean SPfound = false; for(String taxid: PrefixIDTarget_hash.keySet()) { if(GNormPlus.GeneWithoutSPPrefix_hash.containsKey(G_mention.toLowerCase())) { //special case, and no need for prefix - SA } else { Pattern ptmp = Pattern.compile("^("+PrefixIDTarget_hash.get(taxid)+")([A-Z].*)$"); Matcher mtmp = ptmp.matcher(G_mention); if(mtmp.find()) { String MentionWoPrefix=mtmp.group(2); GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, anno[0]+"\t"+anno[1]+"\t"+anno[2]+"|"+MentionWoPrefix+"\t"+anno[3]+"\tPrefix:"+taxid); SPfound=true; break; } } } /** * 2. Co-occurring word * boundary : * Sentence Start: Sentence_offsets.get(Target_Sentence) * Sentence Last: Sentence_offsets.get(Target_Sentence+1) */ //Find the target sentence int Target_Sentence=0; if(SPfound == false) // 1. left : Closed to start of the gene mention { for(int s=0;s Target_Sentence+1){ Sentence_Last = Sentence_offsets.get(Target_Sentence+1); } if(SPfound == false) // 1. left : Closed to start of the gene mention { int closet_Sp_Start=0; for(int sp=0;sp= Sentence_Start && Sp_Start >closet_Sp_Start) { closet_Sp_Start=Sp_Start; if(GNormPlus.SP_Virus2Human_hash.containsKey(taxid)) { GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tLeft:"+taxid+"&9606"); } else { GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tLeft:"+taxid); } SPfound=true; } } } } if(SPfound == false) // 2. right : Closed to last of the gene mention { int closet_Sp_Last=1000000; for(int sp=0;sp= G_Last && Sp_Last <= Sentence_Last && Sp_Last < closet_Sp_Last) { closet_Sp_Last=Sp_Last; if(GNormPlus.SP_Virus2Human_hash.containsKey(taxid)) { GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tRight:"+taxid+"&9606"); } else { GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tRight:"+taxid); } SPfound=true; } } } } /** 3. Focus species */ if(SPfound == false) // 2. right : Closed to last of the gene mention { // 1. only the mentions appeared earlier are inferred // if(mention2Location2Species_hash.containsKey(G_mentions.toLowerCase())) { int closed_loca=0; for (int loca_start : mention2Location2Species_hash.get(G_mentions.toLowerCase()).keySet()) { if(loca_startclosed_loca) { closed_loca=loca_start; } } } if(closed_loca>0) { if(GNormPlus.SP_Virus2Human_hash.containsKey(Location2Species_hash.get(closed_loca))) { GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+Location2Species_hash.get(closed_loca)+"&9606"); } else { GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+Location2Species_hash.get(closed_loca)); } } else { if(GNormPlus.SP_Virus2Human_hash.containsKey(MajorSP)) { GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP+"&9606"); } else { GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP); } } } else { if(GNormPlus.SP_Virus2Human_hash.containsKey(MajorSP)) { GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP+"&9606"); } else { GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP); } } } } } } } GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true); } public void SpeciesAssignment(String Filename,String FilenameBioC,String FocusSpecies) throws IOException, XMLStreamException { for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++) /** PMIDs : i */ { for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) /** Paragraphs : j */ { for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) // Annotation : k { String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t"); if(anno.length==5) //Species { String id=anno[4].replaceAll("\\*", ""); GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+anno[3]+"\t"+id); } else //Gene : if(anno.length==3) { /** 1. prefix */ boolean SPfound = false; if(GNormPlus.GeneWithoutSPPrefix_hash.containsKey(anno[2].toLowerCase())) { //special case, and no need for prefix - SA } else { Pattern ptmp = Pattern.compile("^("+GNormPlus.PrefixID_hash.get(FocusSpecies)+")([A-Z].*)$"); Matcher mtmp = ptmp.matcher(anno[2]); if(mtmp.find()) { String MentionWoPrefix=mtmp.group(2); GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, anno[0]+"\t"+anno[1]+"\t"+anno[2]+"|"+MentionWoPrefix+"\t"+anno[3]+"\tPrefix:"+FocusSpecies); SPfound=true; } } if(SPfound == false) { GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k)+"\tFocus:"+FocusSpecies); } } } } } GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true); } }