|
|
|
|
|
|
|
|
|
|
|
package GNormPluslib;
|
|
|
|
import bioc.BioCAnnotation;
|
|
import bioc.BioCCollection;
|
|
import bioc.BioCDocument;
|
|
import bioc.BioCLocation;
|
|
import bioc.BioCPassage;
|
|
|
|
import bioc.io.BioCDocumentWriter;
|
|
import bioc.io.BioCFactory;
|
|
import bioc.io.woodstox.ConnectorWoodstox;
|
|
import java.io.BufferedReader;
|
|
import java.io.BufferedWriter;
|
|
import java.io.FileInputStream;
|
|
import java.io.FileOutputStream;
|
|
import java.io.FileReader;
|
|
import java.io.FileWriter;
|
|
import java.io.IOException;
|
|
import java.io.InputStreamReader;
|
|
import java.io.OutputStreamWriter;
|
|
import java.text.BreakIterator;
|
|
import java.time.LocalDate;
|
|
import java.time.ZoneId;
|
|
|
|
import javax.xml.stream.XMLStreamException;
|
|
|
|
import org.tartarus.snowball.SnowballStemmer;
|
|
import org.tartarus.snowball.ext.englishStemmer;
|
|
|
|
import java.util.Map;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
import java.util.ArrayList;
|
|
import java.util.HashMap;
|
|
import java.util.List;
|
|
import java.util.Locale;
|
|
import java.util.Collections;
|
|
|
|
public class SR
|
|
{
|
|
@SuppressWarnings("null")
|
|
public void SpeciesRecognition(String Filename,String FilenameBioC,String StrainFilename,String FilterAntibody) throws IOException, XMLStreamException
|
|
{
|
|
|
|
for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++)
|
|
{
|
|
String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i);
|
|
PrefixTree PT_Genus = new PrefixTree();
|
|
HashMap<String, String> SPID_hash = new HashMap<String, String>();
|
|
ArrayList<String> TargetedLocation = new ArrayList<String>();
|
|
HashMap<String, String> GenusNames = new HashMap<String, String>();
|
|
HashMap<String, String> Mention2ID_lc = new HashMap<String, String>();
|
|
ArrayList<String> IDset = new ArrayList<String>();
|
|
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++)
|
|
{
|
|
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j);
|
|
|
|
|
|
ArrayList<String> locations = GNormPlus.PT_Species.SearchMentionLocation(PassageContext,"Species");
|
|
for (int k = 0 ; k < locations.size() ; k++)
|
|
{
|
|
String anno[]=locations.get(k).split("\t");
|
|
int start= Integer.parseInt(anno[0]);
|
|
int last= Integer.parseInt(anno[1]);
|
|
|
|
|
|
String ForwardSTR="";
|
|
String BackwardSTR="";
|
|
if(start>21)
|
|
{
|
|
ForwardSTR = (PassageContext+"ZZZZZZZZZZZZZZZZZZZZZZZZZZZ").substring(start-21,last);
|
|
}
|
|
else
|
|
{
|
|
ForwardSTR = (PassageContext+"ZZZZZZZZZZZZZZZZZZZZZZZZZZZ").substring(0,last);
|
|
}
|
|
if(PassageContext.length()>last+21)
|
|
{
|
|
BackwardSTR = PassageContext.substring(start,last+21);
|
|
}
|
|
else
|
|
{
|
|
BackwardSTR = PassageContext.substring(start,PassageContext.length());
|
|
}
|
|
|
|
String mention = anno[2];
|
|
String id = anno[3];
|
|
String mention_tmp=mention.toLowerCase();
|
|
mention_tmp = mention_tmp.replaceAll("([^A-Za-z0-9@ ])", "\\\\$1");
|
|
String antibody="";
|
|
if(ForwardSTR.toLowerCase().matches(".*(anti|antibody|antibodies|serum|polyclonal|monoclonal|igg)[\\W\\-\\_]+"+mention_tmp)) {antibody="(anti)";}
|
|
else if(BackwardSTR.toLowerCase().matches(mention_tmp+"[\\W\\-\\_]+(anti|antibody|antibodies|serum|polyclonal|monoclonal|igg).*")){antibody="(anti)";}
|
|
else if(BackwardSTR.toLowerCase().matches(mention_tmp+"[\\W\\-\\_]+[A-Za-z0-9]+[\\W\\-\\_]+(anti|antibody|antibodies|serum|polyclonal|monoclonal|igg).*")){antibody="(anti)";}
|
|
|
|
if(mention.matches(".*[\\(\\[\\{].*") && BackwardSTR.toLowerCase().matches(mention_tmp+"\\).*") )
|
|
{
|
|
last=last+1;
|
|
mention=mention+")";
|
|
}
|
|
|
|
if(BackwardSTR.toLowerCase().matches(mention_tmp+"[0-9].*")){}
|
|
else if((mention.matches(".*[;:,].*")) && mention.length()<=10){}
|
|
else if(mention.matches("to[\\W\\-\\_]+[0-9]+")){}
|
|
else if(mention.matches("[a-z][\\)\\]\\}].*") && (!mention.matches(".*[\\(\\[\\{].*")) && mention.length()<=10){}
|
|
else if(mention.matches(".*[\\(\\[\\{].*") && (!mention.matches(".*[\\)\\]\\}].*")) && mention.length()<=10){}
|
|
else if(!id.equals("NA"))
|
|
{
|
|
if(GNormPlus.BioCDocobj.Annotations.size()>i && GNormPlus.BioCDocobj.Annotations.get(i).size()>j)
|
|
{
|
|
if((!mention.matches("^[A-Za-z] [A-Za-z0-9]+$")) && (mention.length()>=3))
|
|
{
|
|
if(FilterAntibody.equals("False") || (!antibody.equals("(anti)")))
|
|
{
|
|
String patt="^(.+?) [sS]train";
|
|
Pattern ptmp = Pattern.compile(patt);
|
|
Matcher mtmp = ptmp.matcher(mention);
|
|
if(mtmp.find())
|
|
{
|
|
mention=mtmp.group(1);
|
|
last=last-7;
|
|
}
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tSpecies\t"+id);
|
|
String mentions_tmp=mention.toLowerCase();
|
|
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
|
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
|
GNormPlus.Filtering_hash.put(mentions_tmp,"");
|
|
Mention2ID_lc.put(mention.toLowerCase(), id);
|
|
|
|
String mention_genus = "";
|
|
patt="^([A-Za-z]+) ";
|
|
ptmp = Pattern.compile(patt);
|
|
mtmp = ptmp.matcher(mention);
|
|
if(mtmp.find())
|
|
{
|
|
mention_genus=mtmp.group(1);
|
|
}
|
|
|
|
IDset.add(id);
|
|
for(int s=start;s<last;s++)
|
|
{
|
|
TargetedLocation.add(j+"\t"+s);
|
|
}
|
|
String ids[]=id.split(";");
|
|
for(int x=0;x<ids.length;x++)
|
|
{
|
|
patt="^\\**([0-9]+)";
|
|
ptmp = Pattern.compile(patt);
|
|
mtmp = ptmp.matcher(ids[x]);
|
|
if(mtmp.find())
|
|
{
|
|
SPID_hash.put(mtmp.group(1), mention_genus);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
locations = GNormPlus.PT_Cell.SearchMentionLocation(PassageContext,"Cell");
|
|
for (int k = 0 ; k < locations.size() ; k++)
|
|
{
|
|
String anno[]=locations.get(k).split("\t");
|
|
int start= Integer.parseInt(anno[0]);
|
|
int last= Integer.parseInt(anno[1]);
|
|
String mention = anno[2];
|
|
String id = anno[3];
|
|
if(GNormPlus.BioCDocobj.Annotations.size()>i && GNormPlus.BioCDocobj.Annotations.get(i).size()>j)
|
|
{
|
|
if(!TargetedLocation.contains(j+"\t"+start))
|
|
{
|
|
int last40=0;
|
|
if(PassageContext.length()>=last+40)
|
|
{
|
|
last40=last+40;
|
|
}
|
|
else
|
|
{
|
|
last40=PassageContext.length();
|
|
}
|
|
|
|
|
|
String ForwardSTR="";
|
|
String BackwardSTR="";
|
|
if(start>21)
|
|
{
|
|
ForwardSTR = PassageContext.substring(start-21,last);
|
|
}
|
|
else
|
|
{
|
|
ForwardSTR = PassageContext.substring(0,last);
|
|
}
|
|
if(PassageContext.length()>last+21)
|
|
{
|
|
BackwardSTR = PassageContext.substring(start,last+21);
|
|
}
|
|
else
|
|
{
|
|
BackwardSTR = PassageContext.substring(start,PassageContext.length());
|
|
}
|
|
String mention_tmp=mention.toLowerCase();
|
|
mention_tmp = mention_tmp.replaceAll("([^A-Za-z0-9@ ])", "\\\\$1");
|
|
if(mention_tmp.matches(".*[\\[\\]\\(\\)\\{\\}].*")){}
|
|
else if(BackwardSTR.toLowerCase().matches(mention_tmp+"[0-9\\-\\_].*")){}
|
|
else if(ForwardSTR.toLowerCase().matches(".*[0-9\\-\\_]"+mention_tmp)){}
|
|
else
|
|
{
|
|
String patt="[\\W\\-]cell([\\- ]*line|)[s]*[\\W\\-]";
|
|
Pattern ptmp = Pattern.compile(patt);
|
|
Matcher mtmp = ptmp.matcher(PassageContext.substring(last, last40).toLowerCase());
|
|
if(mtmp.find())
|
|
{
|
|
if(GNormPlus.taxid4gene.contains(id))
|
|
{
|
|
id="*"+id;
|
|
}
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tCell\t"+id);
|
|
String mentions_tmp=mention.toLowerCase();
|
|
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
|
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
|
GNormPlus.Filtering_hash.put(mentions_tmp,"");
|
|
IDset.add(id);
|
|
for(int s=start;s<last;s++)
|
|
{
|
|
TargetedLocation.add(j+"\t"+s);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
for(String ID: SPID_hash.keySet())
|
|
{
|
|
if(GNormPlus.GenusID_hash.containsKey(ID))
|
|
{
|
|
GenusNames.put(ID,GNormPlus.GenusID_hash.get(ID));
|
|
}
|
|
if(SPID_hash.get(ID).length()>=7)
|
|
{
|
|
GenusNames.put(ID,SPID_hash.get(ID));
|
|
}
|
|
}
|
|
}
|
|
|
|
GenusNames.put("3702", "arabidopsis");
|
|
GenusNames.put("4932", "saccharomyces");
|
|
GenusNames.put("562", "escherichia");
|
|
GenusNames.put("7227", "drosophila");
|
|
GenusNames.put("8355", "xenopus");
|
|
|
|
PT_Genus.Hash2Tree(GenusNames);
|
|
|
|
|
|
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++)
|
|
{
|
|
if(GNormPlus.BioCDocobj.PassageContexts.size()>i &&
|
|
GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j &&
|
|
GNormPlus.BioCDocobj.Annotations.size()>i &&
|
|
GNormPlus.BioCDocobj.Annotations.get(i).size()>j
|
|
)
|
|
{
|
|
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j);
|
|
ArrayList<String> locations_Genus = PT_Genus.SearchMentionLocation(PassageContext,"Genus");
|
|
for (int k = 0 ; k < locations_Genus.size() ; k++)
|
|
{
|
|
String anno[]=locations_Genus.get(k).split("\t");
|
|
String start= anno[0];
|
|
String last= anno[1];
|
|
String mention = anno[2];
|
|
String id = anno[3];
|
|
if(!TargetedLocation.contains(j+"\t"+start))
|
|
{
|
|
String patt="^\\**([0-9]+)$";
|
|
Pattern ptmp = Pattern.compile(patt);
|
|
Matcher mtmp = ptmp.matcher(id);
|
|
if(mtmp.find())
|
|
{
|
|
id = mtmp.group(1);
|
|
}
|
|
|
|
if(GNormPlus.taxid4gene.contains(id))
|
|
{
|
|
id="*"+id;
|
|
}
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tGenus\t"+id);
|
|
String mentions_tmp=mention.toLowerCase();
|
|
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
|
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
|
GNormPlus.Filtering_hash.put(mentions_tmp,"");
|
|
IDset.add(id);
|
|
for(int s=Integer.parseInt(start);s<Integer.parseInt(last);s++)
|
|
{
|
|
TargetedLocation.add(j+"\t"+s);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
PrefixTree PT_Strain = new PrefixTree();
|
|
HashMap<String, String> StrainID_hash = new HashMap<String, String>();
|
|
BufferedReader br = new BufferedReader(new FileReader(StrainFilename));
|
|
String line="";
|
|
while ((line = br.readLine()) != null)
|
|
{
|
|
String l[]=line.split("\t");
|
|
String ancestor = l[0];
|
|
String tax_id = l[1];
|
|
String tax_names = l[2];
|
|
if(SPID_hash.containsKey(ancestor))
|
|
{
|
|
StrainID_hash.put(tax_id, tax_names);
|
|
}
|
|
else if(SPID_hash.containsKey(tax_id))
|
|
{
|
|
StrainID_hash.put(tax_id, tax_names);
|
|
}
|
|
}
|
|
br.close();
|
|
HashMap<String, String> StrainNames = new HashMap<String, String>();
|
|
for(String ID: StrainID_hash.keySet())
|
|
{
|
|
StrainNames.put(ID,StrainID_hash.get(ID));
|
|
}
|
|
|
|
PT_Strain.Hash2Tree(StrainNames);
|
|
|
|
|
|
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++)
|
|
{
|
|
if(GNormPlus.BioCDocobj.PassageContexts.size()>i &&
|
|
GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j &&
|
|
GNormPlus.BioCDocobj.Annotations.size()>i &&
|
|
GNormPlus.BioCDocobj.Annotations.get(i).size()>j
|
|
)
|
|
{
|
|
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j);
|
|
ArrayList<String> locations_Strain = PT_Strain.SearchMentionLocation(PassageContext,"Strain");
|
|
for (int k = 0 ; k < locations_Strain.size() ; k++)
|
|
{
|
|
String anno[]=locations_Strain.get(k).split("\t");
|
|
String start= anno[0];
|
|
String last= anno[1];
|
|
String mention = anno[2];
|
|
String id = anno[3];
|
|
if(!TargetedLocation.contains(j+"\t"+start))
|
|
{
|
|
if((!mention.matches(".*[;,\\{\\}\\(\\)\\[\\]].*")) && !mention.matches("[a-z]{1,4} [0-9]{1,3}"))
|
|
{
|
|
if(GNormPlus.taxid4gene.contains(id))
|
|
{
|
|
id="*"+id;
|
|
}
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tStrain\t"+id);
|
|
String mentions_tmp=mention.toLowerCase();
|
|
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
|
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
|
GNormPlus.Filtering_hash.put(mentions_tmp,"");
|
|
IDset.add(id);
|
|
for(int s=Integer.parseInt(start);s<Integer.parseInt(last);s++)
|
|
{
|
|
TargetedLocation.add(j+"\t"+s);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
HashMap<String, String> OtherNames = new HashMap<String, String>();
|
|
for(String men : Mention2ID_lc.keySet())
|
|
{
|
|
String men_id= Mention2ID_lc.get(men);
|
|
if(GNormPlus.PmidLF2Abb_lc_hash.containsKey(Pmid+"\t"+men))
|
|
{
|
|
String Abb = GNormPlus.PmidLF2Abb_lc_hash.get(Pmid+"\t"+men);
|
|
|
|
if(OtherNames.containsKey(men_id))
|
|
{
|
|
OtherNames.put(men_id, OtherNames.get(men_id)+"|"+Abb);
|
|
}
|
|
else
|
|
{
|
|
OtherNames.put(men_id,Abb);
|
|
}
|
|
}
|
|
String men_nospace=men.replaceAll(" ", "");
|
|
|
|
if(OtherNames.containsKey(men_id))
|
|
{
|
|
OtherNames.put(men_id, OtherNames.get(men_id)+"|"+men_nospace);
|
|
}
|
|
else
|
|
{
|
|
OtherNames.put(men_id,men_nospace);
|
|
}
|
|
}
|
|
PrefixTree PT_Others = new PrefixTree();
|
|
PT_Others.Hash2Tree(OtherNames);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++)
|
|
{
|
|
if(GNormPlus.BioCDocobj.PassageContexts.size()>i &&
|
|
GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j &&
|
|
GNormPlus.BioCDocobj.Annotations.size()>i &&
|
|
GNormPlus.BioCDocobj.Annotations.get(i).size()>j
|
|
)
|
|
{
|
|
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j);
|
|
ArrayList<String> locations_Abb = PT_Others.SearchMentionLocation(PassageContext,"Species");
|
|
for (int k = 0 ; k < locations_Abb.size() ; k++)
|
|
{
|
|
String anno[]=locations_Abb.get(k).split("\t");
|
|
String start= anno[0];
|
|
String last= anno[1];
|
|
String mention = anno[2];
|
|
String id = anno[3];
|
|
if(!TargetedLocation.contains(j+"\t"+start))
|
|
{
|
|
if(GNormPlus.taxid4gene.contains(id))
|
|
{
|
|
id="*"+id;
|
|
}
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tSpecies\t"+id);
|
|
String mentions_tmp=mention.toLowerCase();
|
|
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
|
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
|
GNormPlus.Filtering_hash.put(mentions_tmp,"");
|
|
Mention2ID_lc.put(mention.toLowerCase(), id);
|
|
IDset.add(id);
|
|
for(int s=Integer.parseInt(start);s<Integer.parseInt(last);s++)
|
|
{
|
|
TargetedLocation.add(j+"\t"+s);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++)
|
|
{
|
|
if(GNormPlus.BioCDocobj.PassageContexts.size()>i && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j && GNormPlus.BioCDocobj.Annotations.size()>i && GNormPlus.BioCDocobj.Annotations.get(i).size()>j)
|
|
{
|
|
ArrayList <Integer> remove_anno = new ArrayList <Integer>();
|
|
for (int a = 0; a < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); a++)
|
|
{
|
|
String SpAnno[]=GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(a).split("\t");
|
|
String start= SpAnno[0];
|
|
String last= SpAnno[1];
|
|
String mention = SpAnno[2];
|
|
String type = SpAnno[3];
|
|
|
|
if(type.matches("Gene|FamilyName"))
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(a,start+"\t"+last+"\t"+mention+"\t"+type);
|
|
}
|
|
else if(type.matches("Species|Genus|Strain|Cell") && SpAnno.length==5)
|
|
{
|
|
|
|
|
|
if(GNormPlus.PmidAbb2LF_lc_hash.containsKey(Pmid+"\t"+mention.toLowerCase()) && Mention2ID_lc.containsKey(GNormPlus.PmidAbb2LF_lc_hash.containsKey(Pmid+"\t"+mention.toLowerCase())))
|
|
{
|
|
String LF_lc=GNormPlus.PmidAbb2LF_lc_hash.get(Pmid+"\t"+mention.toLowerCase());
|
|
if(Mention2ID_lc.containsKey(LF_lc))
|
|
{
|
|
String LF_ID=Mention2ID_lc.get(LF_lc);
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(a, start+"\t"+last+"\t"+mention+"\t"+type+"\t"+LF_ID);
|
|
String mentions_tmp=mention.toLowerCase();
|
|
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
|
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
|
GNormPlus.Filtering_hash.put(mentions_tmp,"");
|
|
}
|
|
}
|
|
else if (SpAnno.length>4)
|
|
{
|
|
String id = SpAnno[4];
|
|
String id_split[]=id.split(";");
|
|
if(id_split.length>=2)
|
|
{
|
|
|
|
boolean found=false;
|
|
for(int x=0;x<IDset.size();x++)
|
|
{
|
|
String id_tmp= IDset.get(x);
|
|
for(int y=0;y<id_split.length;y++)
|
|
{
|
|
if(id_split[y].equals(id_tmp))
|
|
{
|
|
found=true;
|
|
}
|
|
}
|
|
if(found == true)
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(a, start+"\t"+last+"\t"+mention+"\t"+type+"\t"+id_tmp);
|
|
String mentions_tmp=mention.toLowerCase();
|
|
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
|
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
|
GNormPlus.Filtering_hash.put(mentions_tmp,"");
|
|
x=1000000;
|
|
}
|
|
}
|
|
|
|
|
|
if(found == false)
|
|
{
|
|
int min=10000000;
|
|
String min_id="";
|
|
for(int y=0;y<id_split.length;y++)
|
|
{
|
|
String id_tmp = id_split[y];
|
|
String patt="^\\**([0-9]+)";
|
|
Pattern ptmp = Pattern.compile(patt);
|
|
Matcher mtmp = ptmp.matcher(id_tmp);
|
|
if(mtmp.find())
|
|
{
|
|
id_tmp = mtmp.group(1);
|
|
}
|
|
|
|
if(y==0)
|
|
{
|
|
min_id=id_split[y];
|
|
min=Integer.parseInt(id_tmp);
|
|
}
|
|
else if(Integer.parseInt(id_tmp)<min)
|
|
{
|
|
min=Integer.parseInt(id_tmp);
|
|
min_id=id_tmp;
|
|
}
|
|
}
|
|
if(GNormPlus.taxid4gene.contains(min_id))
|
|
{
|
|
min_id="*"+min_id;
|
|
}
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(a,start+"\t"+last+"\t"+mention+"\tSpecies\t"+min_id);
|
|
String mentions_tmp=mention.toLowerCase();
|
|
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
|
|
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
|
|
GNormPlus.Filtering_hash.put(mentions_tmp,"");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
remove_anno.add(a);
|
|
}
|
|
}
|
|
|
|
Collections.sort(remove_anno);
|
|
for (int counter = remove_anno.size()-1; counter >= 0 ; counter--)
|
|
{
|
|
int ai=remove_anno.get(counter);
|
|
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(ai);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
|
|
}
|
|
public void SpeciesAssignment(String Filename,String FilenameBioC) throws IOException, XMLStreamException
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations = new ArrayList();
|
|
GNormPlus.BioCDocobj.BioCReaderWithAnnotation(Filename);
|
|
|
|
BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US);
|
|
for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++)
|
|
{
|
|
HashMap<String, String> PrefixIDTarget_hash = new HashMap<String, String>();
|
|
PrefixIDTarget_hash.put("9606", "h");
|
|
PrefixIDTarget_hash.put("10090", "m");
|
|
PrefixIDTarget_hash.put("10116", "r");
|
|
PrefixIDTarget_hash.put("4932", "y");
|
|
PrefixIDTarget_hash.put("7227", "d");
|
|
PrefixIDTarget_hash.put("7955", "z|zf|Zf|dr|Dr");
|
|
PrefixIDTarget_hash.put("3702", "at|At");
|
|
|
|
HashMap<String, Double> SP2Num_hash = new HashMap<String, Double>();
|
|
for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++)
|
|
{
|
|
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++)
|
|
{
|
|
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
|
|
if(anno.length==5)
|
|
{
|
|
String patt="^\\**([0-9]+)$";
|
|
Pattern ptmp = Pattern.compile(patt);
|
|
Matcher mtmp = ptmp.matcher(anno[4]);
|
|
if(mtmp.find())
|
|
{
|
|
String id = mtmp.group(1);
|
|
|
|
if(!PrefixIDTarget_hash.containsKey(id))
|
|
{
|
|
PrefixIDTarget_hash.put(id,GNormPlus.PrefixID_hash.get(id));
|
|
}
|
|
if(j == 0)
|
|
{
|
|
if(SP2Num_hash.containsKey(id))
|
|
{
|
|
SP2Num_hash.put(id, SP2Num_hash.get(id)+2);
|
|
}
|
|
else
|
|
{
|
|
if(GNormPlus.TaxFreq_hash.containsKey(id))
|
|
{
|
|
SP2Num_hash.put(id, GNormPlus.TaxFreq_hash.get(id)+2);
|
|
}
|
|
else
|
|
{
|
|
SP2Num_hash.put(id, 2.0);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
else
|
|
{
|
|
if(SP2Num_hash.containsKey(id))
|
|
{
|
|
SP2Num_hash.put(id, SP2Num_hash.get(id)+1);
|
|
}
|
|
else
|
|
{
|
|
if(GNormPlus.TaxFreq_hash.containsKey(id))
|
|
{
|
|
SP2Num_hash.put(id, 1 + GNormPlus.TaxFreq_hash.get(id));
|
|
}
|
|
else
|
|
{
|
|
SP2Num_hash.put(id, 1.0);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
String MajorSP="9606";
|
|
double MaxSP=0;
|
|
for(String tid : SP2Num_hash.keySet())
|
|
{
|
|
if(SP2Num_hash.get(tid)>MaxSP)
|
|
{
|
|
MajorSP=tid;
|
|
MaxSP=SP2Num_hash.get(tid);
|
|
}
|
|
}
|
|
|
|
for (int j = 0; j < GNormPlus.BioCDocobj.PassageContexts.get(i).size(); j++)
|
|
{
|
|
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j);
|
|
|
|
iterator.setText(PassageContext);
|
|
ArrayList<Integer> Sentence_offsets = new ArrayList<Integer>();
|
|
int Sent_start = iterator.first();
|
|
for (int Sent_last = iterator.next(); Sent_last != BreakIterator.DONE; Sent_start = Sent_last, Sent_last = iterator.next())
|
|
{
|
|
Sentence_offsets.add(Sent_start);
|
|
}
|
|
|
|
HashMap<Integer,String> Annotations_Gene_hash = new HashMap<Integer,String>();
|
|
ArrayList<String> Annotations_Species = new ArrayList<String>();
|
|
if(GNormPlus.BioCDocobj.Annotations.get(i).size()>j)
|
|
{
|
|
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++)
|
|
{
|
|
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
|
|
if(anno.length==5)
|
|
{
|
|
Annotations_Species.add(GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k));
|
|
}
|
|
else
|
|
{
|
|
|
|
Annotations_Gene_hash.put(k,GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k));
|
|
}
|
|
}
|
|
|
|
|
|
HashMap<String,HashMap<Integer,String>> mention2Location2Species_hash = new HashMap<String,HashMap<Integer,String>>();
|
|
HashMap<Integer,String> Location2Species_hash = new HashMap<Integer,String>();
|
|
for (int k : Annotations_Gene_hash.keySet())
|
|
{
|
|
boolean SPfound = false;
|
|
String anno[] = Annotations_Gene_hash.get(k).split("\t");
|
|
int G_Start= Integer.parseInt(anno[0]);
|
|
int G_Last= Integer.parseInt(anno[1]);
|
|
String G_mentions = anno[2];
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int Target_Sentence=0;
|
|
if(SPfound == false)
|
|
{
|
|
for(int s=0;s<Sentence_offsets.size();s++)
|
|
|
|
{
|
|
int Sentence_last=1000000;
|
|
if(s<Sentence_offsets.size()-1)
|
|
{
|
|
Sentence_last=Sentence_offsets.get(s+1);
|
|
}
|
|
if(G_Start<Sentence_last)
|
|
{
|
|
Target_Sentence=s;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
int Sentence_Start = Sentence_offsets.get(Target_Sentence);
|
|
int Sentence_Last = 1000000;
|
|
if(Sentence_offsets.size() > Target_Sentence+1){ Sentence_Last = Sentence_offsets.get(Target_Sentence+1); }
|
|
if(SPfound == false)
|
|
{
|
|
int closet_Sp_Start=0;
|
|
for(int sp=0;sp<Annotations_Species.size();sp++)
|
|
{
|
|
String AnnoSp[]=Annotations_Species.get(sp).split("\t");
|
|
int Sp_Start = Integer.parseInt(AnnoSp[0]);
|
|
String patt="^\\**([0-9]+)$";
|
|
Pattern ptmp = Pattern.compile(patt);
|
|
Matcher mtmp = ptmp.matcher(AnnoSp[4]);
|
|
if(mtmp.find())
|
|
{
|
|
String taxid = mtmp.group(1);
|
|
Location2Species_hash.put(Sp_Start,taxid);
|
|
if(Sp_Start <= G_Start && Sp_Start >= Sentence_Start && Sp_Start >closet_Sp_Start)
|
|
{
|
|
closet_Sp_Start=Sp_Start;
|
|
Location2Species_hash.put(Integer.parseInt(anno[0]), taxid);
|
|
|
|
if(mention2Location2Species_hash.containsKey(G_mentions.toLowerCase()))
|
|
{
|
|
mention2Location2Species_hash.get(G_mentions.toLowerCase()).put(Integer.parseInt(anno[0]), taxid);
|
|
}
|
|
else
|
|
{
|
|
mention2Location2Species_hash.put(G_mentions.toLowerCase(),Location2Species_hash);
|
|
}
|
|
|
|
SPfound=true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if(SPfound == false)
|
|
{
|
|
int closet_Sp_Last=1000000;
|
|
for(int sp=0;sp<Annotations_Species.size();sp++)
|
|
{
|
|
String AnnoSp[]=Annotations_Species.get(sp).split("\t");
|
|
int Sp_Last = Integer.parseInt(AnnoSp[1]);
|
|
String patt="^\\**([0-9]+)$";
|
|
Pattern ptmp = Pattern.compile(patt);
|
|
Matcher mtmp = ptmp.matcher(AnnoSp[4]);
|
|
if(mtmp.find())
|
|
{
|
|
String taxid = mtmp.group(1);
|
|
if(Sp_Last >= G_Last && Sp_Last <= Sentence_Last && Sp_Last < closet_Sp_Last)
|
|
{
|
|
closet_Sp_Last=Sp_Last;
|
|
Location2Species_hash.put(Integer.parseInt(anno[0]), taxid);
|
|
|
|
if(mention2Location2Species_hash.containsKey(G_mentions.toLowerCase()))
|
|
{
|
|
mention2Location2Species_hash.get(G_mentions.toLowerCase()).put(Integer.parseInt(anno[0]), taxid);
|
|
}
|
|
else
|
|
{
|
|
mention2Location2Species_hash.put(G_mentions.toLowerCase(),Location2Species_hash);
|
|
}
|
|
|
|
SPfound=true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (int k : Annotations_Gene_hash.keySet())
|
|
{
|
|
String anno[] = Annotations_Gene_hash.get(k).split("\t");
|
|
int G_Start= Integer.parseInt(anno[0]);
|
|
int G_Last= Integer.parseInt(anno[1]);
|
|
String G_mentions = anno[2];
|
|
String G_type = anno[3];
|
|
String G_mention_list[]=G_mentions.split("\\|");
|
|
String G_mention=G_mention_list[0];
|
|
|
|
|
|
boolean SPfound = false;
|
|
for(String taxid: PrefixIDTarget_hash.keySet())
|
|
{
|
|
if(GNormPlus.GeneWithoutSPPrefix_hash.containsKey(G_mention.toLowerCase()))
|
|
{
|
|
|
|
}
|
|
else
|
|
{
|
|
Pattern ptmp = Pattern.compile("^("+PrefixIDTarget_hash.get(taxid)+")([A-Z].*)$");
|
|
Matcher mtmp = ptmp.matcher(G_mention);
|
|
if(mtmp.find())
|
|
{
|
|
String MentionWoPrefix=mtmp.group(2);
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, anno[0]+"\t"+anno[1]+"\t"+anno[2]+"|"+MentionWoPrefix+"\t"+anno[3]+"\tPrefix:"+taxid);
|
|
SPfound=true;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int Target_Sentence=0;
|
|
if(SPfound == false)
|
|
{
|
|
for(int s=0;s<Sentence_offsets.size();s++)
|
|
|
|
{
|
|
int Sentence_last=1000000;
|
|
if(s<Sentence_offsets.size()-1)
|
|
{
|
|
Sentence_last=Sentence_offsets.get(s+1);
|
|
}
|
|
if(G_Start<Sentence_last)
|
|
{
|
|
Target_Sentence=s;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
int Sentence_Start = Sentence_offsets.get(Target_Sentence);
|
|
int Sentence_Last = 1000000;
|
|
if(Sentence_offsets.size() > Target_Sentence+1){ Sentence_Last = Sentence_offsets.get(Target_Sentence+1); }
|
|
if(SPfound == false)
|
|
{
|
|
int closet_Sp_Start=0;
|
|
for(int sp=0;sp<Annotations_Species.size();sp++)
|
|
{
|
|
String AnnoSp[]=Annotations_Species.get(sp).split("\t");
|
|
int Sp_Start = Integer.parseInt(AnnoSp[0]);
|
|
String patt="^\\**([0-9]+)$";
|
|
Pattern ptmp = Pattern.compile(patt);
|
|
Matcher mtmp = ptmp.matcher(AnnoSp[4]);
|
|
if(mtmp.find())
|
|
{
|
|
String taxid = mtmp.group(1);
|
|
if(Sp_Start <= G_Start && Sp_Start >= Sentence_Start && Sp_Start >closet_Sp_Start)
|
|
{
|
|
closet_Sp_Start=Sp_Start;
|
|
if(GNormPlus.SP_Virus2Human_hash.containsKey(taxid))
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tLeft:"+taxid+"&9606");
|
|
}
|
|
else
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tLeft:"+taxid);
|
|
}
|
|
SPfound=true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if(SPfound == false)
|
|
{
|
|
int closet_Sp_Last=1000000;
|
|
for(int sp=0;sp<Annotations_Species.size();sp++)
|
|
{
|
|
String AnnoSp[]=Annotations_Species.get(sp).split("\t");
|
|
int Sp_Last = Integer.parseInt(AnnoSp[1]);
|
|
String patt="^\\**([0-9]+)$";
|
|
Pattern ptmp = Pattern.compile(patt);
|
|
Matcher mtmp = ptmp.matcher(AnnoSp[4]);
|
|
if(mtmp.find())
|
|
{
|
|
String taxid = mtmp.group(1);
|
|
if(Sp_Last >= G_Last && Sp_Last <= Sentence_Last && Sp_Last < closet_Sp_Last)
|
|
{
|
|
closet_Sp_Last=Sp_Last;
|
|
if(GNormPlus.SP_Virus2Human_hash.containsKey(taxid))
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tRight:"+taxid+"&9606");
|
|
}
|
|
else
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tRight:"+taxid);
|
|
}
|
|
SPfound=true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
if(SPfound == false)
|
|
{
|
|
|
|
|
|
if(mention2Location2Species_hash.containsKey(G_mentions.toLowerCase()))
|
|
{
|
|
int closed_loca=0;
|
|
for (int loca_start : mention2Location2Species_hash.get(G_mentions.toLowerCase()).keySet())
|
|
{
|
|
if(loca_start<G_Start)
|
|
{
|
|
if(loca_start>closed_loca)
|
|
{
|
|
closed_loca=loca_start;
|
|
}
|
|
}
|
|
}
|
|
if(closed_loca>0)
|
|
{
|
|
if(GNormPlus.SP_Virus2Human_hash.containsKey(Location2Species_hash.get(closed_loca)))
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+Location2Species_hash.get(closed_loca)+"&9606");
|
|
}
|
|
else
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+Location2Species_hash.get(closed_loca));
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if(GNormPlus.SP_Virus2Human_hash.containsKey(MajorSP))
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP+"&9606");
|
|
}
|
|
else
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP);
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if(GNormPlus.SP_Virus2Human_hash.containsKey(MajorSP))
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP+"&9606");
|
|
}
|
|
else
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
|
|
}
|
|
public void SpeciesAssignment(String Filename,String FilenameBioC,String FocusSpecies) throws IOException, XMLStreamException
|
|
{
|
|
for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++)
|
|
{
|
|
for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++)
|
|
{
|
|
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++)
|
|
{
|
|
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t");
|
|
if(anno.length==5)
|
|
{
|
|
String id=anno[4].replaceAll("\\*", "");
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+anno[3]+"\t"+id);
|
|
}
|
|
else
|
|
{
|
|
|
|
boolean SPfound = false;
|
|
if(GNormPlus.GeneWithoutSPPrefix_hash.containsKey(anno[2].toLowerCase()))
|
|
{
|
|
|
|
}
|
|
else
|
|
{
|
|
Pattern ptmp = Pattern.compile("^("+GNormPlus.PrefixID_hash.get(FocusSpecies)+")([A-Z].*)$");
|
|
Matcher mtmp = ptmp.matcher(anno[2]);
|
|
if(mtmp.find())
|
|
{
|
|
String MentionWoPrefix=mtmp.group(2);
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, anno[0]+"\t"+anno[1]+"\t"+anno[2]+"|"+MentionWoPrefix+"\t"+anno[3]+"\tPrefix:"+FocusSpecies);
|
|
SPfound=true;
|
|
}
|
|
}
|
|
if(SPfound == false)
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k)+"\tFocus:"+FocusSpecies);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,true);
|
|
}
|
|
} |