|
|
|
|
|
|
|
|
|
|
|
package GNormPluslib; |
|
|
|
import bioc.BioCAnnotation; |
|
import bioc.BioCCollection; |
|
import bioc.BioCDocument; |
|
import bioc.BioCLocation; |
|
import bioc.BioCPassage; |
|
|
|
import bioc.io.BioCDocumentWriter; |
|
import bioc.io.BioCFactory; |
|
import bioc.io.woodstox.ConnectorWoodstox; |
|
import java.io.BufferedReader; |
|
import java.io.BufferedWriter; |
|
import java.io.FileInputStream; |
|
import java.io.FileOutputStream; |
|
import java.io.FileReader; |
|
import java.io.FileWriter; |
|
import java.io.IOException; |
|
import java.io.InputStreamReader; |
|
import java.io.OutputStreamWriter; |
|
import java.text.BreakIterator; |
|
import java.time.LocalDate; |
|
import java.time.ZoneId; |
|
import java.text.DecimalFormat; |
|
import java.math.RoundingMode; |
|
|
|
import javax.xml.stream.XMLStreamException; |
|
|
|
import java.util.Map; |
|
import java.util.regex.Matcher; |
|
import java.util.regex.Pattern; |
|
import java.util.ArrayList; |
|
import java.util.HashMap; |
|
import java.util.List; |
|
import java.util.Locale; |
|
|
|
public class GN |
|
{ |
|
public static HashMap<String, String> MatchedTokens_hash = new HashMap<String, String>(); |
|
private double ScoringFunction(String geneid,HashMap<String,String> Mention_hash,String LF) |
|
{ |
|
|
|
|
|
|
|
|
|
|
|
LF = LF.toLowerCase(); |
|
LF = LF.replaceAll("([0-9])([a-z])", "$1 $2"); |
|
LF = LF.replaceAll("([a-z])([0-9])", "$1 $2"); |
|
LF = LF.replaceAll("([\\W\\-\\_])", " "); |
|
LF = LF.replaceAll("[ ]+", " "); |
|
String LF_tkn[]=LF.split(" "); |
|
int LF_ParticalMatch = 0; |
|
|
|
Pattern ptmp = Pattern.compile("[0-9]+\\-([0-9]+)"); |
|
Matcher mtmp = ptmp.matcher(geneid); |
|
Pattern ptmp2 = Pattern.compile("([0-9]+)"); |
|
Matcher mtmp2 = ptmp.matcher(geneid); |
|
if(mtmp.find()) |
|
{ |
|
geneid = "Homo:"+mtmp.group(1); |
|
} |
|
else |
|
{ |
|
geneid = "Gene:"+geneid; |
|
} |
|
|
|
if(GNormPlus.GeneScoring_hash.containsKey(geneid)) |
|
{ |
|
HashMap<String,Double> TF = new HashMap<String,Double>(); |
|
HashMap<String,Double> TermFrequency = new HashMap<String,Double>(); |
|
|
|
|
|
|
|
|
|
String l[]=GNormPlus.GeneScoring_hash.get(geneid).split("\t"); |
|
String tkns_Gene[] = l[0].split(","); |
|
for(int i=0;i<tkns_Gene.length;i++) |
|
{ |
|
String Tkn_Freq[] = tkns_Gene[i].split("-"); |
|
TermFrequency.put(Tkn_Freq[0], Double.parseDouble(Tkn_Freq[1])); |
|
} |
|
Double Cj = Double.parseDouble(l[1]); |
|
Double AllTknNum = Double.parseDouble(l[2]); |
|
|
|
|
|
Double Norm = Double.parseDouble(l[5]); |
|
if(Norm == 0.0){Norm=1.0;} |
|
|
|
|
|
|
|
|
|
for(String Mention : Mention_hash.keySet()) |
|
{ |
|
Mention = Mention.toLowerCase(); |
|
Mention = Mention.replaceAll("([0-9])([a-z])", "$1 $2"); |
|
Mention = Mention.replaceAll("([a-z])([0-9])", "$1 $2"); |
|
Mention = Mention.replaceAll("([\\W\\-\\_])", " "); |
|
Mention = Mention.replaceAll("[ ]+", " "); |
|
String tkns_Mention[]=Mention.split(" "); |
|
for(int i=0;i<tkns_Mention.length;i++) |
|
{ |
|
if(TermFrequency.containsKey(tkns_Mention[i])) |
|
{ |
|
TF.put(tkns_Mention[i], TermFrequency.get(tkns_Mention[i])); |
|
} |
|
} |
|
} |
|
|
|
Double score=0.0; |
|
for(String Tkn : TF.keySet()) |
|
{ |
|
|
|
for(int t=0;t<LF_tkn.length;t++) |
|
{ |
|
if(LF_tkn[t].equals(Tkn)) |
|
{ |
|
LF_ParticalMatch++; |
|
} |
|
} |
|
|
|
double TFij = TF.get(Tkn)/AllTknNum; |
|
double IDFi=GNormPlus.GeneScoringDF_hash.get(Tkn); |
|
score=score+TFij*IDFi*(1/(1-TFij)); |
|
} |
|
|
|
if(LF_ParticalMatch>0){score = score + LF_ParticalMatch;} |
|
return score; |
|
} |
|
else |
|
{ |
|
|
|
return 0.0; |
|
} |
|
} |
|
|
|
public void PreProcessing4GN(String Filename,String FilenameBioC) throws IOException, XMLStreamException |
|
{ |
|
for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++) |
|
{ |
|
for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) |
|
{ |
|
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) |
|
{ |
|
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t"); |
|
String start=anno[0]; |
|
String last=anno[1]; |
|
String mentions=anno[2]; |
|
String type=anno[3]; |
|
String id=""; |
|
if(anno.length>=5) |
|
{ |
|
id=anno[4]; |
|
} |
|
|
|
if(type.equals("Gene")) |
|
{ |
|
String mentionArr[] = mentions.split("\\|"); |
|
boolean update=false; |
|
for(int m=0;m<mentionArr.length;m++) |
|
{ |
|
Pattern ptmp = Pattern.compile("^(.*[0-9A-Z])[ ]*p$"); |
|
Matcher mtmp = ptmp.matcher(mentionArr[m]); |
|
Pattern ptmp2 = Pattern.compile("^(.+)nu$"); |
|
Matcher mtmp2 = ptmp2.matcher(mentionArr[m]); |
|
Pattern ptmp3 = Pattern.compile("^(.*)alpha(.*)$"); |
|
Matcher mtmp3 = ptmp3.matcher(mentionArr[m]); |
|
Pattern ptmp4 = Pattern.compile("^(.*)beta(.*)$"); |
|
Matcher mtmp4 = ptmp4.matcher(mentionArr[m]); |
|
Pattern ptmp5 = Pattern.compile("^(.+[0-9])a$"); |
|
Matcher mtmp5 = ptmp5.matcher(mentionArr[m]); |
|
Pattern ptmp6 = Pattern.compile("^(.+[0-9])b$"); |
|
Matcher mtmp6 = ptmp6.matcher(mentionArr[m]); |
|
Pattern ptmp7 = Pattern.compile("^(.+)II([a-z])$"); |
|
Matcher mtmp7 = ptmp7.matcher(mentionArr[m]); |
|
Pattern ptmp8 = Pattern.compile("^(.+)III([a-z])$"); |
|
Matcher mtmp8 = ptmp8.matcher(mentionArr[m]); |
|
if(mtmp.find()) |
|
{ |
|
mentions=mentions+"|"+mtmp.group(1); |
|
update=true; |
|
} |
|
if(mtmp2.find()) |
|
{ |
|
mentions=mentions+"|"+mtmp2.group(1); |
|
update=true; |
|
} |
|
if(mtmp3.find()) |
|
{ |
|
mentions=mentions+"|"+mtmp3.group(1)+"a"+mtmp3.group(2); |
|
update=true; |
|
} |
|
if(mtmp4.find()) |
|
{ |
|
mentions=mentions+"|"+mtmp4.group(1)+"b"+mtmp4.group(2); |
|
update=true; |
|
} |
|
if(mtmp5.find()) |
|
{ |
|
mentions=mentions+"|"+mtmp5.group(1)+"alpha"; |
|
update=true; |
|
} |
|
if(mtmp6.find()) |
|
{ |
|
mentions=mentions+"|"+mtmp6.group(1)+"beta"; |
|
update=true; |
|
} |
|
if(mtmp7.find()) |
|
{ |
|
mentions=mentions+"|"+mtmp7.group(1)+"2"+mtmp7.group(2); |
|
update=true; |
|
} |
|
if(mtmp8.find()) |
|
{ |
|
mentions=mentions+"|"+mtmp8.group(1)+"3"+mtmp8.group(2); |
|
update=true; |
|
} |
|
} |
|
if(update == true) |
|
{ |
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start + "\t" + last + "\t" + mentions + "\t" + type + "\t" + id ); |
|
} |
|
} |
|
} |
|
} |
|
} |
|
|
|
} |
|
|
|
public void ChromosomeRecognition(String Filename,String FilenameBioC) throws IOException, XMLStreamException |
|
{ |
|
for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) |
|
{ |
|
String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i); |
|
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) |
|
{ |
|
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); |
|
|
|
|
|
ArrayList<String> locations = GNormPlus.PT_GeneChromosome.SearchMentionLocation(PassageContext,"ChromosomeLocation"); |
|
for (int k = 0 ; k < locations.size() ; k++) |
|
{ |
|
String anno[]=locations.get(k).split("\t"); |
|
|
|
|
|
|
|
String ids = anno[3]; |
|
|
|
String IDs[] = ids.split("[\\|,]"); |
|
for(int idcount=0;idcount<IDs.length;idcount++) |
|
{ |
|
|
|
GNormPlus.Pmid2ChromosomeGene_hash.put(Pmid+"\t"+IDs[idcount],""); |
|
} |
|
} |
|
} |
|
} |
|
|
|
} |
|
|
|
public void GeneNormalization(String Filename,String FilenameBioC,boolean GeneIDMatch) throws IOException, XMLStreamException |
|
{ |
|
final DecimalFormat df = new DecimalFormat("0.####"); |
|
df.setRoundingMode(RoundingMode.HALF_UP); |
|
|
|
|
|
for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++) |
|
{ |
|
String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i); |
|
|
|
|
|
HashMap<String,String> Species_hash = new HashMap<String,String>(); |
|
for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) |
|
{ |
|
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) |
|
{ |
|
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t"); |
|
String mentions=anno[2]; |
|
String type=anno[3]; |
|
if(type.matches("(Species|Genus|Strain|CellLine|Cell)")) |
|
{ |
|
Species_hash.put(mentions,""); |
|
} |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
String tiabs=""; |
|
for (int j = 0; j < GNormPlus.BioCDocobj.PassageContexts.get(i).size(); j++) |
|
{ |
|
tiabs=tiabs+GNormPlus.BioCDocobj.PassageContexts.get(i).get(j).toLowerCase(); |
|
} |
|
HashMap<String,HashMap<String,String>> GeneMention_hash = new HashMap<String,HashMap<String,String>>(); |
|
HashMap<String,String> Mention_hash = new HashMap<String,String>(); |
|
for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) |
|
{ |
|
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) |
|
{ |
|
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t"); |
|
String start=anno[0]; |
|
String last=anno[1]; |
|
String mentions=anno[2]; |
|
String type=anno[3]; |
|
String taxids="Tax:9606"; |
|
|
|
if(anno.length>=5) |
|
{ |
|
taxids=anno[4]; |
|
} |
|
String mentions_tmp=mentions.toLowerCase(); |
|
mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]",""); |
|
mentions_tmp=mentions_tmp.replaceAll("[0-9]","0"); |
|
taxids=taxids.replaceAll("(Focus|Right|Left|Prefix|Tax):",""); |
|
if(taxids.equals("")) |
|
{ |
|
taxids="9606"; |
|
} |
|
|
|
boolean found_filter = false; |
|
if(GNormPlus.Filtering_hash.containsKey(mentions_tmp)) |
|
{ |
|
found_filter=true; |
|
} |
|
|
|
if(found_filter==false) |
|
{ |
|
for(String f : GNormPlus.Filtering_WithLongForm_hash.keySet()) |
|
{ |
|
if( GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*[\\t\\|]"+f+"\tGene.*") || |
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*\\t"+f+"\\|[^\t]+\tGene.*") |
|
) |
|
{ |
|
String lf=GNormPlus.Filtering_WithLongForm_hash.get(f); |
|
if(tiabs.matches(".*"+lf+".*")) |
|
{ |
|
found_filter=true; |
|
break; |
|
} |
|
} |
|
} |
|
} |
|
|
|
if(found_filter==false) |
|
{ |
|
if( GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*[\\t\\|][a-z]\tGene.*") || |
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).matches(".*\\t[a-z]\\|[^\t]+\tGene.*") |
|
) |
|
{ |
|
found_filter=true; |
|
|
|
} |
|
} |
|
|
|
if(found_filter == false) |
|
{ |
|
if(type.matches("Gene")) |
|
{ |
|
if(GeneMention_hash.containsKey(mentions+"\t"+taxids)) |
|
{ |
|
GeneMention_hash.get(mentions+"\t"+taxids).put(start+"\t"+last,""); |
|
} |
|
else |
|
{ |
|
HashMap<String,String> offset_hash = new HashMap<String,String>(); |
|
offset_hash.put(start+"\t"+last,""); |
|
GeneMention_hash.put(mentions+"\t"+taxids, offset_hash); |
|
GeneMention_hash.get(mentions+"\t"+taxids).put("type", type); |
|
Mention_hash.put(mentions,"Gene"); |
|
} |
|
} |
|
else if(type.matches("(FamilyName|DomainMotif)")) |
|
{ |
|
String GMs[]=mentions.split("\\|"); |
|
for(int g=0;g<GMs.length;g++) |
|
{ |
|
String mention = GMs[g]; |
|
Mention_hash.put(mention,"FamilyDomain"); |
|
} |
|
} |
|
} |
|
|
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
HashMap<String,String> GuaranteedGene2ID = new HashMap<String,String>(); |
|
HashMap<String,String> MultiGene2ID = new HashMap<String,String>(); |
|
for(String GeneMentionTax : GeneMention_hash.keySet()) |
|
{ |
|
String GT[]=GeneMentionTax.split("\\t"); |
|
String mentions=GT[0]; |
|
String taxids=GT[1]; |
|
String GMs[]=mentions.split("\\|"); |
|
|
|
HashMap<String,String> taxids_hash = new HashMap<String,String>(); |
|
String taxids_arr[]=taxids.split(","); |
|
for(int t=0;t<taxids_arr.length;t++) |
|
{ |
|
taxids_hash.put(taxids_arr[t], ""); |
|
} |
|
|
|
for(int ms=0;ms<GMs.length;ms++) |
|
{ |
|
String mention = GMs[ms]; |
|
String IDstr = GNormPlus.PT_Gene.MentionMatch(mention); |
|
String IDs[]=IDstr.split("\\|"); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for(int c=0;c<IDs.length;c++) |
|
{ |
|
String tax2ID[]=IDs[c].split(":"); |
|
if(taxids_hash.containsKey(tax2ID[0])) |
|
{ |
|
String geneid=tax2ID[1]; |
|
String TargetTax=tax2ID[0]; |
|
GeneMention_hash.get(GeneMentionTax).put("ID", geneid); |
|
GeneMention_hash.get(GeneMentionTax).put("TargetTax", TargetTax); |
|
break; |
|
} |
|
} |
|
|
|
|
|
if(GeneMention_hash.get(GeneMentionTax).containsKey("ID")) |
|
{ |
|
Pattern ptmp = Pattern.compile("\\*([0-9]+(\\-[0-9]+|))"); |
|
Matcher mtmp = ptmp.matcher(GeneMention_hash.get(GeneMentionTax).get("ID")); |
|
|
|
if(mtmp.find()) |
|
{ |
|
GeneMention_hash.get(GeneMentionTax).put("ID",mtmp.group(1)); |
|
GuaranteedGene2ID.put(GeneMentionTax,mtmp.group(1)); |
|
} |
|
else if(GeneMention_hash.get(GeneMentionTax).get("ID").matches("[0-9]+(\\-[0-9]+|)")) |
|
{ |
|
GuaranteedGene2ID.put(GeneMentionTax,GeneMention_hash.get(GeneMentionTax).get("ID")); |
|
} |
|
else |
|
{ |
|
String ID[] = GeneMention_hash.get(GeneMentionTax).get("ID").split(","); |
|
boolean FoundByChroLoca=false; |
|
for(int idcount=0;idcount<ID.length;idcount++) |
|
{ |
|
if(GNormPlus.Pmid2ChromosomeGene_hash.containsKey(Pmid+"\t"+ID[idcount])) |
|
{ |
|
GuaranteedGene2ID.put(GeneMentionTax,ID[idcount]); |
|
FoundByChroLoca=true; |
|
break; |
|
} |
|
} |
|
if(FoundByChroLoca == false) |
|
{ |
|
MultiGene2ID.put(GeneMentionTax, GeneMention_hash.get(GeneMentionTax).get("ID")); |
|
} |
|
} |
|
} |
|
if(GNormPlus.suffixprefix_orig2modified.containsKey(mention) && (!IDstr.equals("-1")) && (!IDstr.equals("-2")) && (!IDstr.equals("-3"))) |
|
{ |
|
break; |
|
} |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
for(String GeneMentionTax_M : MultiGene2ID.keySet()) |
|
{ |
|
for(String GeneMentionTax_G : GuaranteedGene2ID.keySet()) |
|
{ |
|
String MG[] = MultiGene2ID.get(GeneMentionTax_M).split(","); |
|
for(int m=0;m<MG.length;m++) |
|
{ |
|
if(MG[m].equals(GuaranteedGene2ID.get(GeneMentionTax_G))) |
|
{ |
|
GeneMention_hash.get(GeneMentionTax_M).put("ID",MG[m]); |
|
} |
|
} |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
for(String GeneMentionTax : GeneMention_hash.keySet()) |
|
{ |
|
String MT[] = GeneMentionTax.split("\\t"); |
|
if(GNormPlus.PmidLF2Abb_hash.containsKey(Pmid+"\t"+MT[0])) |
|
{ |
|
String GeneMentionTax_Abb = GNormPlus.PmidLF2Abb_hash.get(Pmid+"\t"+MT[0]) + "\t" + MT[1]; |
|
if(GeneMention_hash.containsKey(GeneMentionTax_Abb) && GeneMention_hash.get(GeneMentionTax).containsKey("ID")) |
|
{ |
|
GeneMention_hash.get(GeneMentionTax_Abb).put("ID", GeneMention_hash.get(GeneMentionTax).get("ID")); |
|
} |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
for(String GeneMentionTax : GeneMention_hash.keySet()) |
|
{ |
|
if(GeneMention_hash.get(GeneMentionTax).containsKey("ID") && GeneMention_hash.get(GeneMentionTax).get("ID").matches(".+,.+")) |
|
{ |
|
String geneids=GeneMention_hash.get(GeneMentionTax).get("ID"); |
|
String geneid[] = geneids.split(","); |
|
|
|
String OutputStyle="Top1"; |
|
if(OutputStyle.equals("Top1")) |
|
{ |
|
|
|
double max_score=0.0; |
|
String target_geneid=""; |
|
for(int g=0;g<geneid.length;g++) |
|
{ |
|
String MT[] = GeneMentionTax.split("\\t"); |
|
String LF=""; |
|
if(GNormPlus.PmidAbb2LF_hash.containsKey(Pmid+"\t"+MT[0])) |
|
{ |
|
LF = GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+MT[0]); |
|
} |
|
double score = ScoringFunction(geneid[g],Mention_hash,LF); |
|
if(score>max_score) |
|
{ |
|
max_score=score; |
|
target_geneid=geneid[g]; |
|
} |
|
else if(score == 0.0) |
|
{ |
|
|
|
} |
|
} |
|
GeneMention_hash.get(GeneMentionTax).put("ID", target_geneid); |
|
} |
|
else |
|
{ |
|
|
|
String geneSTR=""; |
|
for(int g=0;g<geneid.length;g++) |
|
{ |
|
String MT[] = GeneMentionTax.split("\\t"); |
|
String LF=""; |
|
if(GNormPlus.PmidAbb2LF_hash.containsKey(Pmid+"\t"+MT[0])) |
|
{ |
|
LF = GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+MT[0]); |
|
} |
|
double score = ScoringFunction(geneid[g],Mention_hash,LF); |
|
String hoge = df.format(score); |
|
score=Double.parseDouble(hoge); |
|
|
|
if(geneSTR.equals("")) |
|
{ |
|
geneSTR=geneid[g]+"-"+score; |
|
} |
|
else |
|
{ |
|
geneSTR=geneSTR+","+geneid[g]+"-"+score; |
|
} |
|
} |
|
GeneMention_hash.get(GeneMentionTax).put("ID", geneSTR); |
|
} |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
for(String GeneMentionTax : GeneMention_hash.keySet()) |
|
{ |
|
String MT[] = GeneMentionTax.split("\\t"); |
|
if(GNormPlus.PmidAbb2LF_hash.containsKey(Pmid+"\t"+MT[0])) |
|
{ |
|
String GeneMentionTax_LF = GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+MT[0]) + "\t" + MT[1]; |
|
if(GeneMention_hash.containsKey(GeneMentionTax_LF) && GeneMention_hash.get(GeneMentionTax).containsKey("ID")) |
|
{ |
|
GeneMention_hash.get(GeneMentionTax_LF).put("ID", GeneMention_hash.get(GeneMentionTax).get("ID")); |
|
} |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
ArrayList<String> removeGMT = new ArrayList<String>(); |
|
for(String GeneMentionTax : GeneMention_hash.keySet()) |
|
{ |
|
String GT[]=GeneMentionTax.split("\\t"); |
|
String mentions=GT[0]; |
|
String tax=GT[1]; |
|
if(GeneMention_hash.get(GeneMentionTax).containsKey("type") && GeneMention_hash.get(GeneMentionTax).get("type").equals("Gene") && GeneMention_hash.get(GeneMentionTax).containsKey("ID")) |
|
{ |
|
String type = GeneMention_hash.get(GeneMentionTax).get("type"); |
|
String id = GeneMention_hash.get(GeneMentionTax).get("ID"); |
|
String geneid=""; |
|
Pattern ptmp1 = Pattern.compile("^([0-9]+)\\-([0-9]+)$"); |
|
Pattern ptmp2 = Pattern.compile("^([0-9]+)$"); |
|
Matcher mtmp1 = ptmp1.matcher(id); |
|
Matcher mtmp2 = ptmp2.matcher(id); |
|
|
|
if(mtmp1.find()) |
|
{ |
|
geneid = "Homo:"+mtmp1.group(2); |
|
} |
|
else if(mtmp2.find()) |
|
{ |
|
geneid = "Gene:"+mtmp2.group(1); |
|
} |
|
|
|
boolean LongFormTknMatch= false; |
|
boolean LongFormExist= true; |
|
if(GNormPlus.GeneScoring_hash.containsKey(geneid)) |
|
{ |
|
if(GNormPlus.PmidAbb2LF_lc_hash.containsKey(Pmid+"\t"+mentions.toLowerCase())) |
|
{ |
|
|
|
|
|
|
|
|
|
String l[]=GNormPlus.GeneScoring_hash.get(geneid).split("\t"); |
|
String tkns_Gene[] = l[0].split(","); |
|
ArrayList<String> tkn_lexicon = new ArrayList<String>(); |
|
for(int ti=0;ti<tkns_Gene.length;ti++) |
|
{ |
|
String Tkn_Freq[] = tkns_Gene[ti].split("-"); |
|
tkn_lexicon.add(Tkn_Freq[0]); |
|
} |
|
|
|
String LF_lc=GNormPlus.PmidAbb2LF_lc_hash.get(Pmid+"\t"+mentions.toLowerCase()); |
|
LF_lc = LF_lc.replaceAll("([0-9])([A-Za-z])", "$1 $2"); |
|
LF_lc = LF_lc.replaceAll("([A-Za-z])([0-9])", "$1 $2"); |
|
String tkn_mention[] = LF_lc.split("[\\W\\-\\_]"); |
|
for(int tl=0;tl<tkn_lexicon.size();tl++) |
|
{ |
|
for(int tm=0;tm<tkn_mention.length;tm++) |
|
{ |
|
if(tkn_lexicon.get(tl).equals(tkn_mention[tm]) && (!tkn_mention[tm].matches("[0-9]+"))) |
|
{ |
|
LongFormTknMatch = true; |
|
} |
|
} |
|
} |
|
} |
|
else{LongFormExist = false;} |
|
} |
|
else{LongFormTknMatch = true;} |
|
|
|
if(LongFormTknMatch == false && LongFormExist == true) |
|
{ |
|
removeGMT.add(GeneMentionTax); |
|
removeGMT.add(GNormPlus.PmidAbb2LF_hash.get(Pmid+"\t"+mentions)+"\t"+tax); |
|
} |
|
else if(mentions.length()<=2 && LongFormExist == false) |
|
{ |
|
removeGMT.add(GeneMentionTax); |
|
} |
|
} |
|
} |
|
|
|
for(int gmti=0;gmti<removeGMT.size();gmti++) |
|
{ |
|
GeneMention_hash.remove(removeGMT.get(gmti)); |
|
} |
|
|
|
|
|
for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) |
|
{ |
|
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) |
|
{ |
|
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t"); |
|
String start=anno[0]; |
|
String last=anno[1]; |
|
String mentions=anno[2]; |
|
String type=anno[3]; |
|
String taxid_org="Tax:9606"; |
|
if(anno.length>=5) |
|
{ |
|
taxid_org=anno[4]; |
|
} |
|
String taxids=taxid_org.replaceAll("(Focus|Right|Left|Prefix|Tax):",""); |
|
String GMs[]=mentions.split("\\|"); |
|
|
|
if(GeneMention_hash.containsKey(mentions+"\t"+taxids) && GeneMention_hash.get(mentions+"\t"+taxids).containsKey("TargetTax")) |
|
{ |
|
String taxtype=taxid_org.replaceAll(":([0-9,]+)",""); |
|
String taxid=GeneMention_hash.get(mentions+"\t"+taxids).get("TargetTax"); |
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mentions+"\t"+type+"\t"+taxtype+":"+taxid); |
|
} |
|
|
|
if(type.equals("Gene")) |
|
{ |
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k) + "|"); |
|
|
|
|
|
if(GeneMention_hash.containsKey(mentions+"\t"+taxids) && GeneMention_hash.get(mentions+"\t"+taxids).containsKey("ID")) |
|
{ |
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k) + GeneMention_hash.get(mentions+"\t"+taxids).get("ID") + "," ); |
|
} |
|
else |
|
{ |
|
|
|
} |
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).substring(0, GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).length()-1)); |
|
} |
|
} |
|
} |
|
|
|
|
|
HashMap<String,String> GeneMentions = new HashMap<String,String>(); |
|
HashMap<String,String> GeneMentionLocation = new HashMap<String,String>(); |
|
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) |
|
{ |
|
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) |
|
{ |
|
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t"); |
|
int start = Integer.parseInt(anno[0]); |
|
int last = Integer.parseInt(anno[1]); |
|
String mentions=anno[2]; |
|
String type=anno[3]; |
|
String id="Tax:9606"; |
|
if(anno.length>=5) |
|
{ |
|
id=anno[4]; |
|
} |
|
if(type.equals("Gene") && id.matches("(Focus|Right|Left|Prefix|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)")) |
|
{ |
|
GeneMentions.put(mentions.toLowerCase(), id); |
|
for (int s=start ;s<=last;s++) |
|
{ |
|
GeneMentionLocation.put(j+"\t"+s,""); |
|
} |
|
} |
|
else if(type.equals("Gene") && id.matches("(Focus|Right|Left|Prefix|Tax)\\:([0-9]+)\\|([0-9]+)")) |
|
{ |
|
GeneMentions.put(mentions.toLowerCase(), id); |
|
for (int s=start ;s<=last;s++) |
|
{ |
|
GeneMentionLocation.put(j+"\t"+s,""); |
|
} |
|
} |
|
} |
|
} |
|
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) |
|
{ |
|
if(GNormPlus.BioCDocobj.PassageContexts.size()>i && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j) |
|
{ |
|
String PassageContexts = " " + GNormPlus.BioCDocobj.PassageContexts.get(i).get(j) + " "; |
|
String PassageContexts_tmp = PassageContexts.toLowerCase(); |
|
for(String gm : GeneMentions.keySet()) |
|
{ |
|
String id = GeneMentions.get(gm); |
|
if(gm.length()>=3) |
|
{ |
|
gm = gm.replaceAll("[ ]*[\\|]*$", ""); |
|
gm = gm.replaceAll("^[\\|]*[ ]*", ""); |
|
gm = gm.replaceAll("[\\|][\\|]+", "\\|"); |
|
if(!gm.matches("[\\W\\-\\_]*")) |
|
{ |
|
gm = gm.replaceAll("([^A-Za-z0-9\\| ])", "\\\\$1"); |
|
Pattern ptmp = Pattern.compile("^(.*[\\W\\-\\_])("+gm+")([\\W\\-\\_].*)$"); |
|
Matcher mtmp = ptmp.matcher(PassageContexts_tmp); |
|
while(mtmp.find()) |
|
{ |
|
String pre = mtmp.group(1); |
|
String gmtmp = mtmp.group(2); |
|
String post = mtmp.group(3); |
|
|
|
int start = pre.length()-1; |
|
int last = start+gmtmp.length(); |
|
if(PassageContexts.length()>=last+1) |
|
{ |
|
String mention = PassageContexts.substring(start+1,last+1); |
|
if(!GeneMentionLocation.containsKey(j+"\t"+start) && !GeneMentionLocation.containsKey(j+"\t"+last)) |
|
{ |
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tGene\t"+id); |
|
} |
|
} |
|
gmtmp = gmtmp.replaceAll(".", "\\@"); |
|
PassageContexts_tmp=pre+""+gmtmp+""+post; |
|
mtmp = ptmp.matcher(PassageContexts_tmp); |
|
} |
|
} |
|
} |
|
} |
|
} |
|
} |
|
|
|
|
|
HashMap<String,String> geneids = new HashMap<String,String>(); |
|
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) |
|
{ |
|
for (int k = 0; k < GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); k++) |
|
{ |
|
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t"); |
|
String type=anno[3]; |
|
if(type.equals("Gene")) |
|
{ |
|
String id="Tax:9606"; |
|
if(anno.length>=5) |
|
{ |
|
id=anno[4]; |
|
} |
|
Pattern ptmp0 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)$"); |
|
Matcher mtmp0 = ptmp0.matcher(id); |
|
Pattern ptmp1 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)$"); |
|
Matcher mtmp1 = ptmp1.matcher(id); |
|
if(mtmp0.find()) |
|
{ |
|
geneids.put(mtmp0.group(3), ""); |
|
} |
|
if(mtmp1.find()) |
|
{ |
|
geneids.put(mtmp1.group(3), ""); |
|
} |
|
} |
|
} |
|
} |
|
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) |
|
{ |
|
for (int k = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k >=0 ; k--) |
|
{ |
|
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t"); |
|
String mention=anno[2]; |
|
String type=anno[3]; |
|
if(type.matches("(FamilyName|DomainMotif)")) |
|
{ |
|
String id="Tax:9606"; |
|
if(anno.length>=5) |
|
{ |
|
id=anno[4]; |
|
} |
|
String IDstrs = GNormPlus.PT_FamilyName.MentionMatch(mention); |
|
String IDstr[]=IDstrs.split("\\|"); |
|
String ids=""; |
|
for(int id_i=0;id_i<IDstr.length;id_i++) |
|
{ |
|
if(geneids.containsKey(IDstr[id_i])) |
|
{ |
|
if(ids.equals("")) |
|
{ |
|
ids=IDstr[id_i]; |
|
} |
|
else |
|
{ |
|
ids=ids+";"+IDstr[id_i]; |
|
} |
|
} |
|
} |
|
if(!ids.equals("")) |
|
{ |
|
if(type.equals("FamilyName")){type="Gene";} |
|
String Annotation_k=anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+type+"\tTax:9606"; |
|
if(anno.length>=5) |
|
{ |
|
Annotation_k=anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+type+"\t"+anno[4]; |
|
} |
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k,Annotation_k+"|"+ids); |
|
} |
|
else |
|
{ |
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k); |
|
} |
|
} |
|
} |
|
} |
|
|
|
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) |
|
{ |
|
for (int k = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k >=0 ; k--) |
|
{ |
|
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t"); |
|
String type=anno[3]; |
|
if(type.equals("Species") || type.equals("Genus") || type.equals("Strain") || type.equals("CellLine") || type.equals("Cell")) |
|
{ |
|
String id=anno[4]; |
|
id=id.replaceAll("\\*", ""); |
|
id=id.replaceAll("\\(anti\\)", ""); |
|
String Annotation_k=anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+type+"\t"+id; |
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k,Annotation_k); |
|
} |
|
} |
|
} |
|
|
|
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++) |
|
{ |
|
|
|
for (int k = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k >=0 ; k--) |
|
{ |
|
String anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\t"); |
|
int start = Integer.parseInt(anno[0]); |
|
int last = Integer.parseInt(anno[1]); |
|
String mention = anno[2]; |
|
String type = anno[3]; |
|
String id = anno[4]; |
|
if(type.equals("Gene") && Species_hash.containsKey(mention)) |
|
{ |
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k); |
|
} |
|
else if(type.equals("Gene") && id.equals("")) |
|
{ |
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k); |
|
} |
|
else |
|
{ |
|
for (int k1 = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size()-1; k1 >=0 ; k1--) |
|
{ |
|
if(k1 != k) |
|
{ |
|
String anno1[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k1).split("\t"); |
|
int start1 = Integer.parseInt(anno1[0]); |
|
int last1 = Integer.parseInt(anno1[1]); |
|
if((start1<start && last1>=last) || (start1<=start && last1>last)) |
|
{ |
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(k); |
|
break; |
|
} |
|
} |
|
} |
|
} |
|
} |
|
} |
|
} |
|
if(GeneIDMatch == true) |
|
{ |
|
|
|
} |
|
else |
|
{ |
|
GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,true,true); |
|
} |
|
} |
|
|
|
|
|
|
|
public ArrayList<String> SearchGeneIDLocation(String Doc) |
|
{ |
|
ArrayList<String> location = new ArrayList<String>(); |
|
|
|
String Doc_tmp=" "+Doc+" "; |
|
Pattern ptmp = Pattern.compile("^(.*[^A-Za-z0-9]+)([0-9]+\\S*[A-Za-z]+|[A-Za-z]+\\S*[0-9]+|[0-9]+\\S*[A-Za-z]+\\S*[0-9]+|[A-Za-z]+\\S*[0-9]+\\S*[A-Za-z]+)([^A-Za-z0-9]+.*)$"); |
|
Matcher mtmp = ptmp.matcher(Doc_tmp); |
|
while(mtmp.find()) |
|
{ |
|
String str1=mtmp.group(1); |
|
String str2=mtmp.group(2); |
|
String str3=mtmp.group(3); |
|
for(int m=str1.length();m<=(str1.length()+str2.length());m++) |
|
{ |
|
int start = str1.length()-1; |
|
int last = start+str2.length(); |
|
String mention = Doc.substring(start, last); |
|
if(!mention.matches(".*[\\'\\;\\[\\]\\+\\*\\\\].*")) |
|
{ |
|
if(last-start>6 && (mention.matches(".*\\(.*\\).*") || mention.matches("[^\\(\\)]+")) ) |
|
{ |
|
Pattern ptmp1 = Pattern.compile("^(.+[^0-9])([0-9]+)\\-([0-9]+)$"); |
|
Matcher mtmp1 = ptmp1.matcher(mention); |
|
Pattern ptmp2 = Pattern.compile("^(.+[^0-9])([0-9]+)\\-(.+[^0-9])([0-9]+)$"); |
|
Matcher mtmp2 = ptmp2.matcher(mention); |
|
if(mtmp1.find()) |
|
{ |
|
String S1 = mtmp1.group(1); |
|
if(mtmp1.group(2).length()<=6 && mtmp1.group(3).length()<=6) |
|
{ |
|
int Num1 = Integer.parseInt(mtmp1.group(2)); |
|
int Num2 = Integer.parseInt(mtmp1.group(3)); |
|
String prefix = ""; |
|
Pattern ptmp3 = Pattern.compile("^([0]+)"); |
|
Matcher mtmp3 = ptmp3.matcher(mtmp1.group(2)); |
|
if(mtmp3.find()) |
|
{ |
|
prefix = mtmp3.group(1); |
|
} |
|
if(Num2-Num1>0 && (Num2-Num1<=20)) |
|
{ |
|
for(int n=Num1;n<=Num2;n++) |
|
{ |
|
String StrNum=S1+prefix+n; |
|
if(StrNum.length()>=5) |
|
{ |
|
location.add(start+"\t"+last+"\t"+StrNum+"\tGeneID"); |
|
} |
|
} |
|
} |
|
} |
|
} |
|
else if(mtmp2.find()) |
|
{ |
|
if(mtmp2.group(2).length()<=6 && mtmp2.group(4).length()<=6) |
|
{ |
|
String S1 = mtmp2.group(1); |
|
int Num1 = Integer.parseInt(mtmp2.group(2)); |
|
String S2 = mtmp2.group(3); |
|
int Num2 = Integer.parseInt(mtmp2.group(4)); |
|
if(S1.equals(S2)) |
|
{ |
|
String prefix = ""; |
|
Pattern ptmp3 = Pattern.compile("^([0]+)"); |
|
Matcher mtmp3 = ptmp3.matcher(mtmp2.group(2)); |
|
if(mtmp3.find()) |
|
{ |
|
prefix = mtmp3.group(1); |
|
} |
|
if(Num2-Num1>0 && (Num2-Num1<=20)) |
|
{ |
|
for(int n=Num1;n<=Num2;n++) |
|
{ |
|
String StrNum=S1+prefix+n; |
|
if(StrNum.length()>=5) |
|
{ |
|
location.add(start+"\t"+last+"\t"+StrNum+"\tGeneID"); |
|
} |
|
} |
|
} |
|
} |
|
} |
|
} |
|
} |
|
location.add(start+"\t"+last+"\t"+mention+"\tGeneID"); |
|
} |
|
} |
|
String men=""; |
|
for(int m=0;m<str2.length();m++){men=men+"@";} |
|
Doc_tmp=str1+men+str3; |
|
mtmp = ptmp.matcher(Doc_tmp); |
|
} |
|
return location; |
|
} |
|
public void GeneIDRecognition(String Filename,String FilenameBioC) throws IOException, XMLStreamException |
|
{ |
|
for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) |
|
{ |
|
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) |
|
{ |
|
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); |
|
|
|
ArrayList<String> locations = SearchGeneIDLocation(PassageContext); |
|
for (int k = 0 ; k < locations.size() ; k++) |
|
{ |
|
String anno[]=locations.get(k).split("\t"); |
|
String mention = anno[2].toLowerCase(); |
|
mention = mention.replaceAll("[\\W\\-\\_]+", ""); |
|
if(GNormPlus.GeneIDs_hash.containsKey(mention)) |
|
{ |
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(locations.get(k)+"\tGeneID:"+GNormPlus.GeneIDs_hash.get(mention)); |
|
} |
|
} |
|
} |
|
} |
|
GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,true,true); |
|
} |
|
} |