|
|
|
|
|
|
|
|
|
|
|
package GNormPluslib;
|
|
|
|
import java.io.*;
|
|
import java.util.*;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
import javax.xml.stream.XMLStreamException;
|
|
|
|
import org.tartarus.snowball.SnowballStemmer;
|
|
import org.tartarus.snowball.ext.englishStemmer;
|
|
|
|
import GNormPluslib.GNormPlus;
|
|
import GNormPluslib.BioCDoc;
|
|
|
|
public class GNR
|
|
{
|
|
|
|
|
|
|
|
public void Ab3P(String Filename,String FilenameAbb,String TrainTest) throws XMLStreamException,IOException
|
|
{
|
|
|
|
|
|
String line="";
|
|
BufferedWriter FileAbb = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FilenameAbb), "UTF-8"));
|
|
for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++)
|
|
{
|
|
String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i);
|
|
String Context="";
|
|
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++)
|
|
{
|
|
String PassageContext=GNormPlus.BioCDocobj.PassageContexts.get(i).get(j);
|
|
if(PassageContext.matches(".*\\([^\\(\\)]+,[^\\(\\)]+\\).*"))
|
|
{
|
|
PassageContext=PassageContext.replaceAll("\\([^\\(\\)]+,[^\\(\\)]+\\)", "");
|
|
}
|
|
if(PassageContext.contains("\\("))
|
|
{
|
|
Context = Context+PassageContext+" ";
|
|
}
|
|
}
|
|
FileAbb.write(Pmid+"\n"+Context+"\n\n");
|
|
}
|
|
FileAbb.close();
|
|
|
|
File f = new File(FilenameAbb+".out");
|
|
BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), "UTF-8"));
|
|
Runtime runtime = Runtime.getRuntime();
|
|
String cmd ="./Ab3P "+FilenameAbb+".Abb "+FilenameAbb+".out";
|
|
|
|
String OS=System.getProperty("os.name").toLowerCase();
|
|
if(OS.contains("windows"))
|
|
{
|
|
cmd ="java -jar bioadi.jar "+FilenameAbb;
|
|
}
|
|
else
|
|
{
|
|
cmd ="./Ab3P "+FilenameAbb+" "+FilenameAbb+".out";
|
|
|
|
}
|
|
|
|
Process process = runtime.exec(cmd);
|
|
InputStream is = process.getInputStream();
|
|
InputStreamReader isr = new InputStreamReader(is, "UTF-8");
|
|
BufferedReader br = new BufferedReader(isr);
|
|
line="";
|
|
while ( (line = br.readLine()) != null)
|
|
{
|
|
fr.write(line);
|
|
fr.newLine();
|
|
fr.flush();
|
|
}
|
|
is.close();
|
|
isr.close();
|
|
br.close();
|
|
fr.close();
|
|
|
|
BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameAbb+".out"), "UTF-8"));
|
|
line="";
|
|
String pmid="";
|
|
while ((line = inputfile.readLine()) != null)
|
|
{
|
|
String patt="^ (.+)\\|(.+)\\|([0-9\\.]+)$";
|
|
Pattern ptmp = Pattern.compile(patt);
|
|
Matcher mtmp = ptmp.matcher(line);
|
|
if(line.matches("^[0-9]+$"))
|
|
{
|
|
pmid=line;
|
|
}
|
|
if(mtmp.find())
|
|
{
|
|
String SF = mtmp.group(1);
|
|
String LF = mtmp.group(2);
|
|
double weight= Double.parseDouble(mtmp.group(3));
|
|
GNormPlus.Pmid2Abb_hash.put(pmid+"\t"+SF, "Abb:SF");
|
|
GNormPlus.Pmid2Abb_hash.put(pmid+"\t"+LF, "Abb:LF");
|
|
GNormPlus.PmidLF2Abb_lc_hash.put(pmid+"\t"+LF.toLowerCase(), SF.toLowerCase());
|
|
GNormPlus.PmidAbb2LF_lc_hash.put(pmid+"\t"+SF.toLowerCase(), LF.toLowerCase());
|
|
GNormPlus.PmidAbb2LF_hash.put(pmid+"\t"+SF, LF);
|
|
if(weight >= 0.9)
|
|
{
|
|
GNormPlus.PmidLF2Abb_hash.put(pmid+"\t"+LF, SF);
|
|
}
|
|
}
|
|
}
|
|
inputfile.close();
|
|
}
|
|
|
|
public void LoadInputFile(String Filename,String FilenameAbb,String TrainTest) throws XMLStreamException,IOException
|
|
{
|
|
|
|
|
|
|
|
GNormPlus.BioCDocobj.BioCReaderWithAnnotation(Filename);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
String line="";
|
|
BufferedWriter FileAbb = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FilenameAbb), "UTF-8"));
|
|
for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++)
|
|
{
|
|
String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i);
|
|
String Context="Text:";
|
|
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++)
|
|
{
|
|
String PassageContext=GNormPlus.BioCDocobj.PassageContexts.get(i).get(j);
|
|
if(PassageContext.matches(".*\\([^\\(\\)]+,[^\\(\\)]+\\).*"))
|
|
{
|
|
PassageContext=PassageContext.replaceAll("\\([^\\(\\)]+,[^\\(\\)]+\\)", "");
|
|
}
|
|
if(PassageContext.contains("("))
|
|
{
|
|
Context = Context+PassageContext+" ";
|
|
}
|
|
}
|
|
FileAbb.write(Pmid+"\n"+Context+"\n\n");
|
|
}
|
|
FileAbb.close();
|
|
|
|
File f = new File(FilenameAbb+".out");
|
|
BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), "UTF-8"));
|
|
Runtime runtime = Runtime.getRuntime();
|
|
String cmd ="./Ab3P "+FilenameAbb+".Abb "+FilenameAbb+".out";
|
|
|
|
String OS=System.getProperty("os.name").toLowerCase();
|
|
if(OS.contains("windows"))
|
|
{
|
|
cmd ="java -jar bioadi.jar "+FilenameAbb;
|
|
}
|
|
else
|
|
{
|
|
cmd ="./Ab3P "+FilenameAbb+" "+FilenameAbb+".out";
|
|
|
|
}
|
|
|
|
Process process = runtime.exec(cmd);
|
|
InputStream is = process.getInputStream();
|
|
InputStreamReader isr = new InputStreamReader(is, "UTF-8");
|
|
BufferedReader br = new BufferedReader(isr);
|
|
line="";
|
|
while ( (line = br.readLine()) != null)
|
|
{
|
|
fr.write(line);
|
|
fr.newLine();
|
|
fr.flush();
|
|
}
|
|
is.close();
|
|
isr.close();
|
|
br.close();
|
|
fr.close();
|
|
|
|
BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameAbb+".out"), "UTF-8"));
|
|
line="";
|
|
String pmid="";
|
|
while ((line = inputfile.readLine()) != null)
|
|
{
|
|
String patt="^ (.+)\\|(.+)\\|([0-9\\.]+)$";
|
|
Pattern ptmp = Pattern.compile(patt);
|
|
Matcher mtmp = ptmp.matcher(line);
|
|
if(line.matches("^[0-9]+$"))
|
|
{
|
|
pmid=line;
|
|
}
|
|
if(mtmp.find())
|
|
{
|
|
String SF = mtmp.group(1);
|
|
String LF = mtmp.group(2);
|
|
double weight= Double.parseDouble(mtmp.group(3));
|
|
GNormPlus.Pmid2Abb_hash.put(pmid+"\t"+SF, "Abb:SF");
|
|
GNormPlus.Pmid2Abb_hash.put(pmid+"\t"+LF, "Abb:LF");
|
|
GNormPlus.PmidLF2Abb_lc_hash.put(pmid+"\t"+LF.toLowerCase(), SF.toLowerCase());
|
|
GNormPlus.PmidAbb2LF_lc_hash.put(pmid+"\t"+SF.toLowerCase(), LF.toLowerCase());
|
|
GNormPlus.PmidAbb2LF_hash.put(pmid+"\t"+SF, LF);
|
|
if(weight >= 0.9)
|
|
{
|
|
GNormPlus.PmidLF2Abb_hash.put(pmid+"\t"+LF, SF);
|
|
}
|
|
}
|
|
}
|
|
inputfile.close();
|
|
}
|
|
|
|
|
|
|
|
|
|
public void FeatureExtraction(String FilenameData,String FilenameLoca,String TrainTest) throws XMLStreamException
|
|
{
|
|
try
|
|
{
|
|
|
|
BufferedWriter FileLocation = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FilenameLoca), "UTF-8"));
|
|
BufferedWriter FileData = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FilenameData), "UTF-8"));
|
|
|
|
SnowballStemmer stemmer = new englishStemmer();
|
|
|
|
for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++)
|
|
{
|
|
String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i);
|
|
|
|
|
|
for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++)
|
|
{
|
|
String PassageName= GNormPlus.BioCDocobj.PassageNames.get(i).get(j);
|
|
int PassageOffset = GNormPlus.BioCDocobj.PassageOffsets.get(i).get(j);
|
|
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j);
|
|
ArrayList<String> Annotation = GNormPlus.BioCDocobj.Annotations.get(i).get(j);
|
|
HashMap<Integer, String> CTDGene_hash = new HashMap<Integer, String>();
|
|
HashMap<Integer, String> FamilyName_hash = new HashMap<Integer, String>();
|
|
HashMap<Integer, String> character_hash = new HashMap<Integer, String>();
|
|
HashMap<Integer, String> Abbreviation_hash = new HashMap<Integer, String>();
|
|
String PassageContext_tmp=" "+PassageContext+" ";
|
|
|
|
|
|
HashMap<Integer, String> Abb_sortebylength = new HashMap<Integer, String>();
|
|
ArrayList<Integer> length_list = new ArrayList<Integer>();
|
|
int countn=0;
|
|
for (Object key : GNormPlus.Pmid2Abb_hash.keySet())
|
|
{
|
|
String pmid2abb[]=key.toString().split("\t");
|
|
if(Pmid.equals(pmid2abb[0]))
|
|
{
|
|
Abb_sortebylength.put(pmid2abb[1].length()*100+countn, pmid2abb[1]);
|
|
length_list.add(pmid2abb[1].length()*100+countn);
|
|
countn++;
|
|
}
|
|
}
|
|
Collections.sort(length_list);
|
|
for (int l=length_list.size()-1;l>=0;l--)
|
|
{
|
|
String AbbLF = Abb_sortebylength.get(length_list.get(l));
|
|
AbbLF=AbbLF.replaceAll("([^A-Za-z0-9@ ])", "\\\\$1");
|
|
AbbLF=AbbLF.replaceAll(" ", "\\[ \\]\\+");
|
|
Pattern ptmp = Pattern.compile("^(.*[^A-Za-z0-9]+)("+AbbLF+")([^A-Za-z0-9]+.*)$");
|
|
Matcher mtmp = ptmp.matcher(PassageContext_tmp);
|
|
while(mtmp.find())
|
|
{
|
|
String str1=mtmp.group(1);
|
|
String str2=mtmp.group(2);
|
|
String str3=mtmp.group(3);
|
|
for(int m=str1.length();m<=(str1.length()+str2.length());m++)
|
|
{
|
|
Abbreviation_hash.put((m-1),GNormPlus.Pmid2Abb_hash.get(Pmid+"\t"+Abb_sortebylength.get(length_list.get(l))));
|
|
}
|
|
String men="";
|
|
for(int m=0;m<str2.length();m++){men=men+"@";}
|
|
PassageContext_tmp=str1+men+str3;
|
|
mtmp = ptmp.matcher(PassageContext_tmp);
|
|
}
|
|
}
|
|
|
|
|
|
ArrayList<String> locations = GNormPlus.PT_CTDGene.SearchMentionLocation(PassageContext,"CTDGene");
|
|
for (int k = 0 ; k < locations.size() ; k++)
|
|
{
|
|
String anno[]=locations.get(k).split("\t");
|
|
int start= Integer.parseInt(anno[0]) + PassageOffset;
|
|
int last= Integer.parseInt(anno[1]) + PassageOffset;
|
|
String mention = anno[2];
|
|
String id = anno[3];
|
|
|
|
CTDGene_hash.put(start,"CTDGene_B");
|
|
CTDGene_hash.put(last,"CTDGene_E");
|
|
for(int s=start+1;s<last;s++)
|
|
{
|
|
CTDGene_hash.put(s,"CTDGene_I");
|
|
}
|
|
}
|
|
|
|
|
|
ArrayList<String> locations_Fname = GNormPlus.PT_FamilyName.SearchMentionLocation(PassageContext,"FamilyName");
|
|
for (int k = 0 ; k < locations_Fname.size() ; k++)
|
|
{
|
|
String anno[]=locations_Fname.get(k).split("\t");
|
|
int start= Integer.parseInt(anno[0]) + PassageOffset;
|
|
int last= Integer.parseInt(anno[1]) + PassageOffset;
|
|
String mention = anno[2];
|
|
String id = anno[3];
|
|
|
|
if(!CTDGene_hash.containsKey(start))
|
|
{
|
|
FamilyName_hash.put(start,"famplex_B");
|
|
FamilyName_hash.put(last,"famplex_E");
|
|
for(int s=start+1;s<last;s++)
|
|
{
|
|
FamilyName_hash.put(s,"famplex_I");
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for (int k = 0; k < Annotation.size() ; k++)
|
|
{
|
|
String anno[]=Annotation.get(k).split("\t");
|
|
int start= Integer.parseInt(anno[0]);
|
|
int last= Integer.parseInt(anno[1]);
|
|
String type = anno[3];
|
|
|
|
character_hash.put(start,type+"_B");
|
|
character_hash.put(last,type+"_E");
|
|
for(int s=start+1;s<last;s++)
|
|
{
|
|
character_hash.put(s,type+"_I");
|
|
}
|
|
}
|
|
|
|
String PassageContext_rev=PassageContext;
|
|
PassageContext_rev = PassageContext_rev.replaceAll("([A-Z][A-Z])([A-Z][0-9][0-9]+[A-Z][\\W\\-\\_])", "$1 $2");
|
|
PassageContext_rev = PassageContext_rev.replaceAll("([0-9])([A-Za-z])", "$1 $2");
|
|
PassageContext_rev = PassageContext_rev.replaceAll("([A-Za-z])([0-9])", "$1 $2");
|
|
|
|
|
|
PassageContext_rev = PassageContext_rev.replaceAll("([\\W])", " $1 ");
|
|
PassageContext_rev = PassageContext_rev.replaceAll("[ ]+", " ");
|
|
PassageContext_tmp = PassageContext;
|
|
int Offset=0;
|
|
String tokens[]=PassageContext_rev.split(" ");
|
|
for(int p=0;p<tokens.length;p++)
|
|
{
|
|
String WSB="WSB:NoGap";
|
|
String WSF="WSF:NoGap";
|
|
while(PassageContext_tmp.substring(0,1).equals(" "))
|
|
{
|
|
PassageContext_tmp=PassageContext_tmp.substring(1);
|
|
Offset++;
|
|
WSB="WSB:Gap";
|
|
}
|
|
if(PassageContext_tmp.length()>tokens[p].length() && PassageContext_tmp.substring(tokens[p].length(),tokens[p].length()+1).equals(" "))
|
|
{
|
|
WSF="WSF:Gap";
|
|
}
|
|
if(p==0)
|
|
{
|
|
WSB="WSB:1st";
|
|
}
|
|
else if(p==tokens.length-1)
|
|
{
|
|
WSF="WSF:last";
|
|
}
|
|
|
|
if(PassageContext_tmp.substring(0,tokens[p].length()).equals(tokens[p]))
|
|
{
|
|
if(tokens[p].length()>0)
|
|
{
|
|
|
|
|
|
|
|
int start=Offset;
|
|
int last=Offset+tokens[p].length();
|
|
String State="";
|
|
if(!character_hash.containsKey(start) || !character_hash.containsKey(last)){}
|
|
else if(character_hash.get(start).matches(".*B$"))
|
|
{
|
|
State=character_hash.get(start);
|
|
}
|
|
else if(character_hash.get(last).matches(".*E$"))
|
|
{
|
|
State=character_hash.get(last);
|
|
}
|
|
else if(character_hash.get(start).matches(".*I$"))
|
|
{
|
|
State=character_hash.get(start);
|
|
}
|
|
|
|
if((!tokens[p].equals("\t")))
|
|
{
|
|
FileLocation.write(Pmid+"\t"+PassageName+"\t"+j+"\t"+tokens[p]+"\t"+(Offset+1)+"\t"+(Offset+tokens[p].length())+"\t"+State+"\n");
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
String Abb_State="__nil__";
|
|
if(!Abbreviation_hash.containsKey(start) || !Abbreviation_hash.containsKey(last)){Abb_State="__nil__";}
|
|
else if(Abbreviation_hash.containsKey(start))
|
|
{
|
|
Abb_State=Abbreviation_hash.get(start);
|
|
}
|
|
|
|
|
|
start=PassageOffset+Offset;
|
|
last=PassageOffset+Offset+tokens[p].length();
|
|
String CTDGene_State="__nil__";
|
|
if(!CTDGene_hash.containsKey(start) || !CTDGene_hash.containsKey(last)){CTDGene_State="__nil__";}
|
|
else if(CTDGene_hash.get(start).matches(".*B$"))
|
|
{
|
|
CTDGene_State=CTDGene_hash.get(start);
|
|
}
|
|
else if(CTDGene_hash.get(last).matches(".*E$"))
|
|
{
|
|
CTDGene_State=CTDGene_hash.get(last);
|
|
}
|
|
else if(CTDGene_hash.get(start).matches(".*I$"))
|
|
{
|
|
CTDGene_State=CTDGene_hash.get(start);
|
|
}
|
|
|
|
|
|
if(CTDGene_State.equals("__nil__"))
|
|
{
|
|
start=PassageOffset+Offset;
|
|
last=PassageOffset+Offset+tokens[p].length();
|
|
if(!FamilyName_hash.containsKey(start) || !FamilyName_hash.containsKey(last)){}
|
|
else if(FamilyName_hash.get(start).matches(".*B$"))
|
|
{
|
|
CTDGene_State=FamilyName_hash.get(start);
|
|
}
|
|
else if(FamilyName_hash.get(last).matches(".*E$"))
|
|
{
|
|
CTDGene_State=FamilyName_hash.get(last);
|
|
}
|
|
else if(FamilyName_hash.get(start).matches(".*I$"))
|
|
{
|
|
CTDGene_State=FamilyName_hash.get(start);
|
|
}
|
|
}
|
|
|
|
|
|
stemmer.setCurrent(tokens[p].toLowerCase());
|
|
stemmer.stem();
|
|
String stem=stemmer.getCurrent();
|
|
|
|
|
|
String Num_num="";
|
|
String tmp=tokens[p];
|
|
tmp=tmp.replaceAll("[^0-9]","");
|
|
if(tmp.length()>3){Num_num="N:4+";}else{Num_num="N:"+ tmp.length();}
|
|
|
|
|
|
String Num_Uc="";
|
|
tmp=tokens[p];
|
|
tmp=tmp.replaceAll("[^A-Z]","");
|
|
if(tmp.length()>3){Num_Uc="U:4+";}else{Num_Uc="U:"+ tmp.length();}
|
|
|
|
|
|
String Num_lc="";
|
|
tmp=tokens[p];
|
|
tmp=tmp.replaceAll("[^a-z]","");
|
|
if(tmp.length()>3){Num_lc="L:4+";}else{Num_lc="L:"+ tmp.length();}
|
|
|
|
|
|
String Num_All="";
|
|
if(tokens[p].length()>3){Num_All="A:4+";}else{Num_All="A:"+ tokens[p].length();}
|
|
|
|
|
|
String SpecificC="__nil__";
|
|
if(tokens[p].equals(";") || tokens[p].equals(":") || tokens[p].equals(",") || tokens[p].equals(".") || tokens[p].equals("-") || tokens[p].equals(">") || tokens[p].equals("+") || tokens[p].equals("_"))
|
|
{
|
|
SpecificC="-SpecificC1-";
|
|
}
|
|
else if(tokens[p].equals("(") || tokens[p].equals(")"))
|
|
{
|
|
SpecificC="-SpecificC2-";
|
|
}
|
|
else if(tokens[p].equals("{") || tokens[p].equals("}"))
|
|
{
|
|
SpecificC="-SpecificC3-";
|
|
}
|
|
else if(tokens[p].equals("[") || tokens[p].equals("]"))
|
|
{
|
|
SpecificC="-SpecificC4-";
|
|
}
|
|
else if(tokens[p].equals("\\") || tokens[p].equals("/"))
|
|
{
|
|
SpecificC="-SpecificC5-";
|
|
}
|
|
|
|
|
|
String ChemPreSuf="__nil__";
|
|
if(tokens[p].matches(".*(yl|ylidyne|oyl|sulfonyl)")){ChemPreSuf="-CHEMinlineSuffix-";}
|
|
else if(tokens[p].matches("(meth|eth|prop|tetracos).*")){ChemPreSuf="-CHEMalkaneStem-";}
|
|
else if(tokens[p].matches("(di|tri|tetra).*")){ChemPreSuf="-CHEMsimpleMultiplier-";}
|
|
else if(tokens[p].matches("(benzen|pyridin|toluen).*")){ChemPreSuf="-CHEMtrivialRing-";}
|
|
else if(tokens[p].matches(".*(one|ol|carboxylic|amide|ate|acid|ium|ylium|ide|uide|iran|olan|inan|pyrid|acrid|amid|keten|formazan|fydrazin)(s|)")){ChemPreSuf="-CHEMsuffix-";}
|
|
|
|
|
|
|
|
String MentionType="__nil__";
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if(tokens[p].matches("(ytochrome|cytochrome)")){MentionType="-Type_cytochrome-";}
|
|
else if(tokens[p].matches(".*target") ){MentionType="-Type_target-";}
|
|
else if(tokens[p].matches(".*(irradiation|hybrid|fusion|experiment|gst|est|gap|antigen)") ){MentionType="-Type_ExperimentNoun-";}
|
|
else if(tokens[p].matches(".*(disease|disorder|dystrophy|deficiency|syndrome|dysgenesis|cancer|injury|neoplasm|diabetes|diabete)") ){MentionType="-Type_Disease-";}
|
|
else if(tokens[p].matches(".*(motif|domain|omain|binding|site|region|sequence|frameshift|finger|box).*") ){MentionType="-Type_DomainMotif-";}
|
|
else if(tokens[p].equals("-") && (p<tokens.length-1 && tokens[p+1].matches(".*(motif|domain|omain|binding|site|region|sequence|frameshift|finger|box).*")) ){MentionType="-Type_DomainMotif-";}
|
|
else if(tokens[p].matches("[rmc]") && (p<tokens.length-1 && (tokens[p+1].equals("DNA") || tokens[p+1].equals("RNA")) ) ){MentionType="-Type_DomainMotif-";}
|
|
else if(tokens[p].matches(".*(famil|complex|cluster|proteins|genes|factors|transporter|proteinase|membrane|ligand|enzyme|channels|tors$|ase$|ases$)") ){MentionType="-Type_Family-";}
|
|
else if(tokens[p].toLowerCase().matches("^marker") ){MentionType="-Type_Marker-";}
|
|
else if(tokens[p].equals(".*cell.*") || (p<tokens.length-1 && tokens[p+1].equals("cell") && tokens[p].matches("^(T|B|monocytic|cancer|tumor|myeloma|epithelial|crypt)$") ) ){MentionType="-Type_Cell-";}
|
|
else if(tokens[p].equals(".*chromosome.*") ){MentionType="-Type_Chromosome-";}
|
|
else if(tokens[p].matches("[pq]") && ( (p<tokens.length-1 && tokens[p+1].matches("^[0-9]+$")) || (p>0 && tokens[p-1].matches("^[0-9]+$")) ) ){MentionType="-Type_ChromosomeStrain-";}
|
|
else if(tokens[p].matches(".*(related|regulated|associated|correlated|reactive).*")){MentionType="-Type_relation-";}
|
|
else if(tokens[p].toLowerCase().matches(".*(polymorphism|mutation|deletion|insertion|duplication|genotype|genotypes).*") ){MentionType="-Type_VariationTerms-";}
|
|
else if(tokens[p].matches(".*(oxidase|transferase|transferases|kinase|kinese|subunit|unit|receptor|adrenoceptor|transporter|regulator|transcription|antigen|protein|gene|factor|member|molecule|channel|deaminase|spectrin).*") ){MentionType="-Type_suffix-";}
|
|
else if(tokens[p].matches("[\\(\\-\\_]") && (p<tokens.length-1 && tokens[p+1].toLowerCase().matches(".*(alpha|beta|gamma|delta|theta|kappa|zeta|sigma|omega|i|ii|iii|iv|v|vi|[abcdefgyr])")) ){MentionType="-Type_strain-";}
|
|
else if(tokens[p].matches("(alpha|beta|gamma|delta|theta|kappa|zeta|sigma|omega|i|ii|iii|iv|v|vi|[abcdefgyr])") ){MentionType="-Type_strain-";}
|
|
|
|
|
|
String ProteinSym="__nil__";
|
|
if(tokens[p].matches(".*(glutamine|glutamic|leucine|valine|isoleucine|lysine|alanine|glycine|aspartate|methionine|threonine|histidine|aspartic|asparticacid|arginine|asparagine|tryptophan|proline|phenylalanine|cysteine|serine|glutamate|tyrosine|stop|frameshift).*")){ProteinSym="-ProteinSymFull-";}
|
|
else if(tokens[p].matches("(cys|ile|ser|gln|met|asn|pro|lys|asp|thr|phe|ala|gly|his|leu|arg|trp|val|glu|tyr|fs|fsx)")){ProteinSym="-ProteinSymTri-";}
|
|
else if(tokens[p].matches("[CISQMNPKDTFAGHLRWVEYX]")){ProteinSym="-ProteinSymChar-";}
|
|
|
|
|
|
String prefix="";
|
|
tmp=tokens[p];
|
|
if(tmp.length()>=1){ prefix=tmp.substring(0, 1);}else{prefix="__nil__";}
|
|
if(tmp.length()>=2){ prefix=prefix+" "+tmp.substring(0, 2);}else{prefix=prefix+" __nil__";}
|
|
if(tmp.length()>=3){ prefix=prefix+" "+tmp.substring(0, 3);}else{prefix=prefix+" __nil__";}
|
|
if(tmp.length()>=4){ prefix=prefix+" "+tmp.substring(0, 4);}else{prefix=prefix+" __nil__";}
|
|
if(tmp.length()>=5){ prefix=prefix+" "+tmp.substring(0, 5);}else{prefix=prefix+" __nil__";}
|
|
|
|
|
|
|
|
String suffix="";
|
|
tmp=tokens[p];
|
|
if(tmp.length()>=1){ suffix=tmp.substring(tmp.length()-1, tmp.length());}else{suffix="__nil__";}
|
|
if(tmp.length()>=2){ suffix=suffix+" "+tmp.substring(tmp.length()-2, tmp.length());}else{suffix=suffix+" __nil__";}
|
|
if(tmp.length()>=3){ suffix=suffix+" "+tmp.substring(tmp.length()-3, tmp.length());}else{suffix=suffix+" __nil__";}
|
|
if(tmp.length()>=4){ suffix=suffix+" "+tmp.substring(tmp.length()-4, tmp.length());}else{suffix=suffix+" __nil__";}
|
|
if(tmp.length()>=5){ suffix=suffix+" "+tmp.substring(tmp.length()-5, tmp.length());}else{suffix=suffix+" __nil__";}
|
|
|
|
if(State.equals(""))
|
|
{
|
|
State="O";
|
|
}
|
|
|
|
if((!tokens[p].equals("\t")))
|
|
{
|
|
if(TrainTest.equals("Train"))
|
|
{
|
|
FileData.write(tokens[p]+" "+stem+" "+WSB+" "+WSF+" "+Num_num+" "+Num_Uc+" "+Num_lc+" "+Num_All+" "+SpecificC+" "+ChemPreSuf+" "+MentionType+" "+ProteinSym+" "+prefix+" "+suffix+" "+CTDGene_State+" "+Abb_State+" "+State+"\n");
|
|
}
|
|
else
|
|
{
|
|
FileData.write(tokens[p]+" "+stem+" "+WSB+" "+WSF+" "+Num_num+" "+Num_Uc+" "+Num_lc+" "+Num_All+" "+SpecificC+" "+ChemPreSuf+" "+MentionType+" "+ProteinSym+" "+prefix+" "+suffix+" "+CTDGene_State+" "+Abb_State+"\n");
|
|
}
|
|
}
|
|
PassageContext_tmp=PassageContext_tmp.substring(tokens[p].length());
|
|
Offset=Offset+tokens[p].length();
|
|
}
|
|
}
|
|
}
|
|
if(tokens.length>0)
|
|
{
|
|
FileLocation.write("\n");
|
|
FileData.write("\n");
|
|
}
|
|
}
|
|
}
|
|
FileLocation.close();
|
|
FileData.close();
|
|
}
|
|
catch(IOException e1){ System.out.println("[MR]: Input file is not exist.");}
|
|
}
|
|
|
|
|
|
|
|
public void CRF_test(String model, String FilenameData, String FilenameOutput) throws IOException
|
|
{
|
|
File f = new File(FilenameOutput);
|
|
BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), "UTF-8"));
|
|
|
|
Runtime runtime = Runtime.getRuntime();
|
|
|
|
String OS=System.getProperty("os.name").toLowerCase();
|
|
|
|
String cmd="./CRF/crf_test -m "+model+" -o "+FilenameOutput+" "+FilenameData;
|
|
if(OS.contains("windows"))
|
|
{
|
|
cmd ="CRF/crf_test -m "+model+" -o "+FilenameOutput+" "+FilenameData;
|
|
}
|
|
else
|
|
{
|
|
cmd ="./CRF/crf_test -m "+model+" -o "+FilenameOutput+" "+FilenameData;
|
|
}
|
|
|
|
try {
|
|
Process process = runtime.exec(cmd);
|
|
InputStream is = process.getInputStream();
|
|
InputStreamReader isr = new InputStreamReader(is, "UTF-8");
|
|
BufferedReader br = new BufferedReader(isr);
|
|
String line="";
|
|
while ( (line = br.readLine()) != null)
|
|
{
|
|
fr.write(line);
|
|
fr.newLine();
|
|
fr.flush();
|
|
}
|
|
is.close();
|
|
isr.close();
|
|
br.close();
|
|
fr.close();
|
|
}
|
|
catch (IOException e) {
|
|
System.out.println(e);
|
|
runtime.exit(0);
|
|
}
|
|
}
|
|
|
|
public void CRF_test(String model,String FilenameData,String FilenameOutput,String top3) throws IOException
|
|
{
|
|
File f = new File(FilenameOutput);
|
|
BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), "UTF-8"));
|
|
|
|
Runtime runtime = Runtime.getRuntime();
|
|
|
|
String OS=System.getProperty("os.name").toLowerCase();
|
|
|
|
String cmd="./CRF/crf_test -n 3 -m "+model+" -o "+FilenameOutput+" "+FilenameData;
|
|
if(OS.contains("windows"))
|
|
{
|
|
cmd ="CRF/crf_test -n 3 -m "+model+" -o "+FilenameOutput+" "+FilenameData;
|
|
}
|
|
else
|
|
{
|
|
cmd ="./CRF/crf_test -n 3 -m "+model+" -o "+FilenameOutput+" "+FilenameData;
|
|
}
|
|
|
|
try {
|
|
Process process = runtime.exec(cmd);
|
|
InputStream is = process.getInputStream();
|
|
InputStreamReader isr = new InputStreamReader(is, "UTF-8");
|
|
BufferedReader br = new BufferedReader(isr);
|
|
String line="";
|
|
while ( (line = br.readLine()) != null)
|
|
{
|
|
fr.write(line);
|
|
fr.newLine();
|
|
fr.flush();
|
|
}
|
|
is.close();
|
|
isr.close();
|
|
br.close();
|
|
fr.close();
|
|
}
|
|
catch (IOException e) {
|
|
System.out.println(e);
|
|
runtime.exit(0);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
public void CRF_learn(String model, String FilenameData) throws IOException
|
|
{
|
|
Runtime runtime = Runtime.getRuntime();
|
|
|
|
Process process = null;
|
|
String line = null;
|
|
InputStream is = null;
|
|
InputStreamReader isr = null;
|
|
BufferedReader br = null;
|
|
|
|
String OS=System.getProperty("os.name").toLowerCase();
|
|
|
|
String cmd="./CRF/crf_learn -f 3 -c 4.0 CRF/template_UB "+FilenameData+" "+model;
|
|
if(OS.contains("windows"))
|
|
{
|
|
cmd ="CRF/crf_learn -f 3 -c 4.0 CRF/template_UB "+FilenameData+" "+model;
|
|
}
|
|
else
|
|
{
|
|
cmd ="./CRF/crf_learn -f 3 -c 4.0 CRF/template_UB "+FilenameData+" "+model;
|
|
}
|
|
|
|
try {
|
|
process = runtime.exec(cmd);
|
|
is = process.getInputStream();
|
|
isr = new InputStreamReader(is, "UTF-8");
|
|
br = new BufferedReader(isr);
|
|
while ( (line = br.readLine()) != null)
|
|
{
|
|
System.out.println(line);
|
|
System.out.flush();
|
|
}
|
|
is.close();
|
|
isr.close();
|
|
br.close();
|
|
}
|
|
catch (IOException e) {
|
|
System.out.println(e);
|
|
runtime.exit(0);
|
|
}
|
|
}
|
|
|
|
public void ReadCRFresult(String Filename,String FilenameLoca,String FilenameOutput,String FilenameBioC) throws XMLStreamException, IOException
|
|
{
|
|
|
|
ArrayList<String> outputArr = new ArrayList<String>();
|
|
BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameOutput), "UTF-8"));
|
|
String line;
|
|
while ((line = inputfile.readLine()) != null)
|
|
{
|
|
outputArr.add(line);
|
|
}
|
|
inputfile.close();
|
|
|
|
|
|
ArrayList<String> locationArr = new ArrayList<String>();
|
|
inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameLoca), "UTF-8"));
|
|
while ((line = inputfile.readLine()) != null)
|
|
{
|
|
locationArr.add(line);
|
|
}
|
|
inputfile.close();
|
|
|
|
|
|
String pmid_last="";
|
|
String paragraph_num_last="";
|
|
String pmid="";
|
|
String paragraph="";
|
|
String paragraph_num="";
|
|
Pattern pat_B = Pattern.compile("((FamilyName|DomainMotif|Gene)_[B])$");
|
|
Pattern pat_IE = Pattern.compile("((FamilyName|DomainMotif|Gene)_[IE])$");
|
|
ArrayList<ArrayList<String>> AnnotationInPMID = new ArrayList();
|
|
ArrayList<String> AnnotationInPassage= new ArrayList<String>();
|
|
GNormPlus.BioCDocobj.Annotations = new ArrayList();
|
|
int countPMID=0;
|
|
int countPassage=0;
|
|
|
|
for(int i=0;i<outputArr.size();i++)
|
|
{
|
|
String outputsRow[]=outputArr.get(i).split("\\t");
|
|
String locationRow[]=locationArr.get(i).split("\\t");
|
|
int start=100000;
|
|
int last=0;
|
|
String MentionType="";
|
|
|
|
if(locationRow.length>3)
|
|
{
|
|
pmid=locationRow[0];
|
|
paragraph=locationRow[1];
|
|
paragraph_num=locationRow[2];
|
|
}
|
|
|
|
if( (!paragraph_num_last.equals("")) && (!paragraph_num.equals(paragraph_num_last)) )
|
|
{
|
|
AnnotationInPMID.add(AnnotationInPassage);
|
|
AnnotationInPassage = new ArrayList<String>();
|
|
countPassage++;
|
|
}
|
|
if( (!pmid_last.equals("")) && (!pmid.equals(pmid_last)) )
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.add(AnnotationInPMID);
|
|
AnnotationInPMID = new ArrayList();
|
|
countPMID++;
|
|
countPassage=0;
|
|
}
|
|
|
|
boolean F = false;
|
|
if(locationRow.length>2)
|
|
{
|
|
Matcher mat = pat_B.matcher(outputsRow[outputsRow.length-1]);
|
|
while(mat.find() && locationRow.length==6)
|
|
{
|
|
MentionType=mat.group(2);
|
|
pmid=locationRow[0];
|
|
paragraph_num=locationRow[2];
|
|
int start_tmp=Integer.parseInt(locationRow[4])-1;
|
|
int last_tmp=Integer.parseInt(locationRow[5]);
|
|
if(start_tmp<start){start=start_tmp;}
|
|
if(last_tmp>last){last=last_tmp;}
|
|
i++;
|
|
F = true;
|
|
if(locationArr.get(i).length()>0)
|
|
{
|
|
outputsRow=outputArr.get(i).split("\\t");
|
|
locationRow=locationArr.get(i).split("\\t");
|
|
mat = pat_IE.matcher(outputsRow[outputsRow.length-1]);
|
|
}
|
|
else
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if(F == true)
|
|
{
|
|
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage);
|
|
String Mention = PassageContext.substring(start, last);
|
|
String Mention_nospace = Mention.replaceAll("[\\W\\-\\_]", "");
|
|
if(Mention.toLowerCase().matches("(figure|tables|fig|tab|exp\\. [0-9]+).*")){}
|
|
else if(Mention.matches("[A-Z][A-Z]s")){}
|
|
else if(Mention.matches(".*\\|.*")){}
|
|
else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\;\\,\\'\\/\\\\].*")){}
|
|
else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\(].*") && !Mention.matches(".*[\\)].*")){}
|
|
else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\[].*") && !Mention.matches(".*[\\]].*")){}
|
|
else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\)].*") && !Mention.matches(".*[\\(].*")){}
|
|
else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\]].*") && !Mention.matches(".*[\\[].*")){}
|
|
else
|
|
{
|
|
AnnotationInPassage.add(start+"\t"+last+"\t"+Mention+"\t"+MentionType);
|
|
}
|
|
i--;
|
|
}
|
|
|
|
paragraph_num_last=paragraph_num;
|
|
pmid_last=pmid;
|
|
}
|
|
AnnotationInPMID.add(AnnotationInPassage);
|
|
GNormPlus.BioCDocobj.Annotations.add(AnnotationInPMID);
|
|
|
|
|
|
}
|
|
|
|
public void ReadCRFresult(String Filename,String FilenameLoca,String FilenameOutput,String FilenameBioC,double threshold,double threshold_GeneType) throws XMLStreamException, IOException
|
|
{
|
|
|
|
ArrayList<String> outputArr1 = new ArrayList<String>();
|
|
ArrayList<String> outputArr2 = new ArrayList<String>();
|
|
ArrayList<String> outputArr3 = new ArrayList<String>();
|
|
ArrayList<String> outputArr1_score = new ArrayList<String>();
|
|
ArrayList<String> outputArr2_score = new ArrayList<String>();
|
|
ArrayList<String> outputArr3_score = new ArrayList<String>();
|
|
BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameOutput), "UTF-8"));
|
|
String line;
|
|
int rank=0;
|
|
String score="";
|
|
Pattern pat_Rank = Pattern.compile("^# ([0-2]) ([0-9\\.]+)$");
|
|
while ((line = inputfile.readLine()) != null)
|
|
{
|
|
Matcher mat = pat_Rank.matcher(line);
|
|
if(mat.find())
|
|
{
|
|
rank = Integer.parseInt(mat.group(1));
|
|
score = mat.group(2);
|
|
}
|
|
else if(rank == 0)
|
|
{
|
|
outputArr1.add(line);
|
|
outputArr1_score.add(score);
|
|
}
|
|
else if(rank == 1)
|
|
{
|
|
outputArr2.add(line);
|
|
outputArr2_score.add(score);
|
|
}
|
|
else if(rank == 2)
|
|
{
|
|
outputArr3.add(line);
|
|
outputArr3_score.add(score);
|
|
}
|
|
}
|
|
inputfile.close();
|
|
|
|
|
|
ArrayList<String> locationArr = new ArrayList<String>();
|
|
inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameLoca), "UTF-8"));
|
|
while ((line = inputfile.readLine()) != null)
|
|
{
|
|
locationArr.add(line);
|
|
}
|
|
inputfile.close();
|
|
|
|
|
|
String pmid_last="";
|
|
String paragraph_num_last="";
|
|
String pmid="";
|
|
String paragraph="";
|
|
String paragraph_num="";
|
|
Pattern pat_B = Pattern.compile("((FamilyName|DomainMotif|Gene)_[B])$");
|
|
Pattern pat_IE = Pattern.compile("((FamilyName|DomainMotif|Gene)_[IE])$");
|
|
ArrayList<ArrayList<String>> AnnotationInPMID = new ArrayList();
|
|
ArrayList<String> AnnotationInPassage= new ArrayList<String>();
|
|
GNormPlus.BioCDocobj.Annotations = new ArrayList();
|
|
int countPMID=0;
|
|
int countPassage=0;
|
|
|
|
int size_Arr=outputArr1.size();
|
|
if(locationArr.size()<outputArr1.size())
|
|
{
|
|
size_Arr=locationArr.size();
|
|
}
|
|
for(int i=0;i<size_Arr;i++)
|
|
{
|
|
String outputsRow[]=outputArr1.get(i).split("\\t");
|
|
String locationRow[]=locationArr.get(i).split("\\t");
|
|
int start=100000;
|
|
int last=0;
|
|
String MentionType="";
|
|
if(locationRow.length>3)
|
|
{
|
|
pmid=locationRow[0];
|
|
paragraph=locationRow[1];
|
|
paragraph_num=locationRow[2];
|
|
}
|
|
|
|
boolean F = false;
|
|
if(outputsRow.length>=1)
|
|
{
|
|
Matcher mat = pat_B.matcher(outputsRow[outputsRow.length-1]);
|
|
while(mat.find() && locationRow.length==6)
|
|
{
|
|
MentionType=mat.group(2);
|
|
pmid=locationRow[0];
|
|
int start_tmp=Integer.parseInt(locationRow[4])-1;
|
|
int last_tmp=Integer.parseInt(locationRow[5]);
|
|
if(start_tmp<start){start=start_tmp;}
|
|
if(last_tmp>last){last=last_tmp;}
|
|
i++;
|
|
outputsRow=outputArr1.get(i).split("\\t");
|
|
locationRow=locationArr.get(i).split("\\t");
|
|
mat = pat_IE.matcher(outputsRow[outputsRow.length-1]);
|
|
F = true;
|
|
}
|
|
}
|
|
|
|
if( (!paragraph_num_last.equals("")) && (!paragraph_num.equals(paragraph_num_last)) )
|
|
{
|
|
AnnotationInPMID.add(AnnotationInPassage);
|
|
AnnotationInPassage = new ArrayList<String>();
|
|
countPassage++;
|
|
}
|
|
|
|
if( !pmid.equals(pmid_last) && paragraph_num.equals("0") && paragraph_num_last.equals("0") )
|
|
{
|
|
AnnotationInPMID.add(AnnotationInPassage);
|
|
AnnotationInPassage = new ArrayList<String>();
|
|
GNormPlus.BioCDocobj.Annotations.add(AnnotationInPMID);
|
|
AnnotationInPMID = new ArrayList();
|
|
countPMID++;
|
|
countPassage=0;
|
|
}
|
|
else if( (!pmid_last.equals("")) && (!pmid.equals(pmid_last)) )
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.add(AnnotationInPMID);
|
|
AnnotationInPMID = new ArrayList();
|
|
countPMID++;
|
|
countPassage=0;
|
|
}
|
|
|
|
if(F == true)
|
|
{
|
|
if(GNormPlus.BioCDocobj.PassageContexts.size()>countPMID && GNormPlus.BioCDocobj.PassageContexts.get(countPMID).size()>countPassage && GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage).length()>=last && (last-start)<1000)
|
|
{
|
|
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage);
|
|
String Mention = PassageContext.substring(start, last);
|
|
String Mention_nospace = Mention.replaceAll("[\\W\\-\\_]", "");
|
|
if(Mention.toLowerCase().matches("(figure|tables|fig|tab|exp\\. [0-9]+).*")){}
|
|
else if(Mention.matches("[A-Z][A-Z]s")){}
|
|
else if(Mention.matches(".*\\|.*")){}
|
|
else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\;\\,\\'\\/\\\\].*")){}
|
|
else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\(].*") && !Mention.matches(".*[\\)].*")){}
|
|
else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\[].*") && !Mention.matches(".*[\\]].*")){}
|
|
else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\)].*") && !Mention.matches(".*[\\(].*")){}
|
|
else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\]].*") && !Mention.matches(".*[\\[].*")){}
|
|
else if((GNormPlus.Abb2Longformtok_hash.containsKey(Mention_nospace.toLowerCase())) && (PassageContext.toLowerCase().matches(".*[\\W\\-\\-]("+GNormPlus.Abb2Longformtok_hash.get(Mention_nospace.toLowerCase())+")[\\W\\-\\-].*")))
|
|
{
|
|
|
|
}
|
|
else
|
|
{
|
|
AnnotationInPassage.add(start+"\t"+last+"\t"+Mention+"\t"+MentionType);
|
|
}
|
|
}
|
|
i--;
|
|
}
|
|
paragraph_num_last=paragraph_num;
|
|
pmid_last=pmid;
|
|
}
|
|
AnnotationInPMID.add(AnnotationInPassage);
|
|
GNormPlus.BioCDocobj.Annotations.add(AnnotationInPMID);
|
|
|
|
|
|
pmid_last="";
|
|
paragraph_num_last="";
|
|
pmid="";
|
|
paragraph="";
|
|
paragraph_num="";
|
|
countPMID=0;
|
|
countPassage=0;
|
|
size_Arr=outputArr2.size();
|
|
if(locationArr.size()<outputArr2.size())
|
|
{
|
|
size_Arr=locationArr.size();
|
|
}
|
|
for(int i=0;i<size_Arr;i++)
|
|
{
|
|
String outputsRow[]=outputArr2.get(i).split("\\t");
|
|
String locationRow[]=locationArr.get(i).split("\\t");
|
|
int start=100000;
|
|
int last=0;
|
|
String MentionType="";
|
|
if(locationRow.length>2)
|
|
{
|
|
pmid=locationRow[0];
|
|
paragraph=locationRow[1];
|
|
paragraph_num=locationRow[2];
|
|
}
|
|
|
|
boolean F = false;
|
|
if(outputsRow.length>=1)
|
|
{
|
|
Matcher mat = pat_B.matcher(outputsRow[outputsRow.length-1]);
|
|
while(mat.find() && locationRow.length==6)
|
|
{
|
|
MentionType=mat.group(2);
|
|
pmid=locationRow[0];
|
|
int start_tmp=Integer.parseInt(locationRow[4])-1;
|
|
int last_tmp=Integer.parseInt(locationRow[5]);
|
|
if(start_tmp<start){start=start_tmp;}
|
|
if(last_tmp>last){last=last_tmp;}
|
|
i++;
|
|
outputsRow=outputArr2.get(i).split("\\t");
|
|
locationRow=locationArr.get(i).split("\\t");
|
|
mat = pat_IE.matcher(outputsRow[outputsRow.length-1]);
|
|
F = true;
|
|
}
|
|
}
|
|
|
|
if( (!paragraph_num_last.equals("")) && (!paragraph_num.equals(paragraph_num_last)) )
|
|
{
|
|
countPassage++;
|
|
}
|
|
|
|
if( !pmid.equals(pmid_last) && paragraph_num.equals("0") && paragraph_num_last.equals("0") )
|
|
{
|
|
countPMID++;
|
|
countPassage=0;
|
|
}
|
|
else if( (!pmid_last.equals("")) && (!pmid.equals(pmid_last)) )
|
|
{
|
|
countPMID++;
|
|
countPassage=0;
|
|
}
|
|
|
|
if(F == true)
|
|
{
|
|
if(GNormPlus.BioCDocobj.PassageContexts.size()>countPMID && GNormPlus.BioCDocobj.PassageContexts.get(countPMID).size()>countPassage && GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage).length()>=last && (last-start)<1000)
|
|
{
|
|
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage);
|
|
String Mention = PassageContext.substring(start, last);
|
|
String Mention_nospace = Mention.replaceAll("[\\W\\-\\_]", "");
|
|
if(Mention.toLowerCase().matches("(figure|tables|fig|tab|exp\\. [0-9]+).*")){}
|
|
else if(Mention.matches("[A-Z][A-Z]s")){}
|
|
else if(Mention.matches(".*\\|.*")){}
|
|
else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\;\\,\\'\\/\\\\].*")){}
|
|
else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\(].*") && !Mention.matches(".*[\\)].*")){}
|
|
else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\[].*") && !Mention.matches(".*[\\]].*")){}
|
|
else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\)].*") && !Mention.matches(".*[\\(].*")){}
|
|
else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\]].*") && !Mention.matches(".*[\\[].*")){}
|
|
else if((GNormPlus.Abb2Longformtok_hash.containsKey(Mention_nospace.toLowerCase())) && (PassageContext.toLowerCase().matches(".*[\\W\\-\\-]("+GNormPlus.Abb2Longformtok_hash.get(Mention_nospace.toLowerCase())+")[\\W\\-\\-].*")))
|
|
{
|
|
|
|
}
|
|
else if(Double.parseDouble(outputArr2_score.get(i))>threshold)
|
|
{
|
|
boolean overlap=false;
|
|
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).size();j++)
|
|
{
|
|
String GetData[]=GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).get(j).split("\t");
|
|
int startj=Integer.parseInt(GetData[0]);
|
|
int lastj=Integer.parseInt(GetData[1]);
|
|
String Mention_tmp = Mention.replaceAll("([^A-Za-z0-9@ ])", "\\\\$1");
|
|
if(MentionType.equals("Gene") && Double.parseDouble(outputArr2_score.get(i))>threshold_GeneType && GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).get(j).matches(start+"\t"+last+"\t"+Mention_tmp+"\t(FamilyName|DomainMotif)") )
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).set(j, start+"\t"+last+"\t"+Mention+"\t"+MentionType);
|
|
}
|
|
else if( (start>=startj && start<lastj) || (last>startj && last<=lastj) )
|
|
{
|
|
overlap=true;
|
|
}
|
|
}
|
|
if(overlap == false)
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).add(start+"\t"+last+"\t"+Mention+"\t"+MentionType);
|
|
}
|
|
}
|
|
}
|
|
i--;
|
|
}
|
|
|
|
paragraph_num_last=paragraph_num;
|
|
pmid_last=pmid;
|
|
}
|
|
|
|
|
|
pmid_last="";
|
|
paragraph_num_last="";
|
|
pmid="";
|
|
paragraph="";
|
|
paragraph_num="";
|
|
countPMID=0;
|
|
countPassage=0;
|
|
size_Arr=outputArr3.size();
|
|
if(locationArr.size()<outputArr3.size())
|
|
{
|
|
size_Arr=locationArr.size();
|
|
}
|
|
for(int i=0;i<size_Arr;i++)
|
|
{
|
|
String outputsRow[]=outputArr3.get(i).split("\\t");
|
|
String locationRow[]=locationArr.get(i).split("\\t");
|
|
int start=100000;
|
|
int last=0;
|
|
String MentionType="";
|
|
if(locationRow.length>2)
|
|
{
|
|
pmid=locationRow[0];
|
|
paragraph=locationRow[1];
|
|
paragraph_num=locationRow[2];
|
|
}
|
|
|
|
boolean F = false;
|
|
if(outputsRow.length>=1)
|
|
{
|
|
Matcher mat = pat_B.matcher(outputsRow[outputsRow.length-1]);
|
|
while(mat.find() && locationRow.length==6)
|
|
{
|
|
MentionType=mat.group(2);
|
|
pmid=locationRow[0];
|
|
paragraph_num=locationRow[2];
|
|
int start_tmp=Integer.parseInt(locationRow[4])-1;
|
|
int last_tmp=Integer.parseInt(locationRow[5]);
|
|
if(start_tmp<start){start=start_tmp;}
|
|
if(last_tmp>last){last=last_tmp;}
|
|
i++;
|
|
outputsRow=outputArr3.get(i).split("\\t");
|
|
locationRow=locationArr.get(i).split("\\t");
|
|
mat = pat_IE.matcher(outputsRow[outputsRow.length-1]);
|
|
F = true;
|
|
}
|
|
}
|
|
|
|
if( (!paragraph_num_last.equals("")) && (!paragraph_num.equals(paragraph_num_last)) )
|
|
{
|
|
countPassage++;
|
|
}
|
|
|
|
if( !pmid.equals(pmid_last) && paragraph_num.equals("0") && paragraph_num_last.equals("0") )
|
|
{
|
|
countPMID++;
|
|
countPassage=0;
|
|
}
|
|
else if( (!pmid_last.equals("")) && (!pmid.equals(pmid_last)) )
|
|
{
|
|
countPMID++;
|
|
countPassage=0;
|
|
}
|
|
|
|
if(F == true)
|
|
{
|
|
if(GNormPlus.BioCDocobj.PassageContexts.size()>countPMID && GNormPlus.BioCDocobj.PassageContexts.get(countPMID).size()>countPassage && GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage).length()>=last && (last-start)<1000)
|
|
{
|
|
String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage);
|
|
String Mention = PassageContext.substring(start, last);
|
|
String Mention_nospace = Mention.replaceAll("[\\W\\-\\_]", "");
|
|
if(Mention.toLowerCase().matches("(figure|tables|fig|tab|exp\\. [0-9]+).*")){}
|
|
else if(Mention.matches("[A-Z][A-Z]s")){}
|
|
else if(Mention.matches(".*\\|.*")){}
|
|
else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\;\\,\\'\\/\\\\].*")){}
|
|
else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\(].*") && !Mention.matches(".*[\\)].*")){}
|
|
else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\[].*") && !Mention.matches(".*[\\]].*")){}
|
|
else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\)].*") && !Mention.matches(".*[\\(].*")){}
|
|
else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\]].*") && !Mention.matches(".*[\\[].*")){}
|
|
else if((GNormPlus.Abb2Longformtok_hash.containsKey(Mention_nospace.toLowerCase())) && (PassageContext.toLowerCase().matches(".*[\\W\\-\\-]("+GNormPlus.Abb2Longformtok_hash.get(Mention_nospace.toLowerCase())+")[\\W\\-\\-].*")))
|
|
{
|
|
|
|
}
|
|
else if(Double.parseDouble(outputArr3_score.get(i))>threshold)
|
|
{
|
|
boolean overlap=false;
|
|
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).size();j++)
|
|
{
|
|
String GetData[]=GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).get(j).split("\t");
|
|
int startj=Integer.parseInt(GetData[0]);
|
|
int lastj=Integer.parseInt(GetData[1]);
|
|
String Mention_tmp = Mention.replaceAll("([^A-Za-z0-9@ ])", "\\\\$1");
|
|
if(MentionType.equals("Gene") && Double.parseDouble(outputArr3_score.get(i))>threshold_GeneType && GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).get(j).matches(start+"\t"+last+"\t"+Mention_tmp+"\t(FamilyName|DomainMotif)") )
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).set(j, start+"\t"+last+"\t"+Mention+"\t"+MentionType);
|
|
}
|
|
else if( (start>=startj && start<lastj) || (last>startj && last<=lastj) )
|
|
{
|
|
overlap=true;
|
|
}
|
|
}
|
|
if(overlap == false)
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).add(start+"\t"+last+"\t"+Mention+"\t"+MentionType);
|
|
}
|
|
}
|
|
}
|
|
i--;
|
|
}
|
|
|
|
paragraph_num_last=paragraph_num;
|
|
pmid_last=pmid;
|
|
}
|
|
|
|
|
|
}
|
|
|
|
public void PostProcessing(String Filename,String FilenameBioC) throws XMLStreamException, IOException
|
|
{
|
|
|
|
String Disease_Suffix="disease|diseases|syndrome|syndromes|tumor|tumour|deficiency|dysgenesis|atrophy|frame|dystrophy";
|
|
String Cell_Suffix="cell|cells";
|
|
String FamilyName_Suffix="disease|diseases|syndrome|syndromes|tumor|tumour|deficiency|dysgenesis|atrophy|frame|dystrophy|frame|factors|family|families|superfamily|superfamilies|subfamily|subfamilies|complex|genes|proteins";
|
|
String DomainMotif_Suffix="domain|motif|domains|motifs|sequences";
|
|
String Strain_Suffix="alpha|beta|gamma|kappa|theta|delta|[A-Ga-g0-9]";
|
|
ArrayList<String> Translate2Family = new ArrayList<String>();
|
|
|
|
for(int i=0;i<GNormPlus.BioCDocobj.Annotations.size();i++)
|
|
{
|
|
|
|
HashMap<String, String> Mention2Type_Hash = new HashMap<String, String>();
|
|
ArrayList<String> GeneMentionPattern = new ArrayList<String>();
|
|
HashMap<String, Integer> MentionType2Num = new HashMap<String, Integer>();
|
|
if(GNormPlus.BioCDocobj.PMIDs.size()>=i)
|
|
{
|
|
String pmid=GNormPlus.BioCDocobj.PMIDs.get(i);
|
|
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++)
|
|
{
|
|
for(int k=0;k<GNormPlus.BioCDocobj.Annotations.get(i).get(j).size();k++)
|
|
{
|
|
String Anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\\t");
|
|
String start = Anno[0];
|
|
String last = Anno[1];
|
|
String mention = Anno[2];
|
|
String type = Anno[3];
|
|
Mention2Type_Hash.put(mention.toLowerCase(),type);
|
|
if(MentionType2Num.containsKey(mention+"\t"+type))
|
|
{
|
|
MentionType2Num.put(mention.toLowerCase()+"\t"+type,MentionType2Num.get(mention+"\t"+type)+1);
|
|
if( GNormPlus.PmidLF2Abb_lc_hash.containsKey(pmid+"\t"+mention.toLowerCase()) )
|
|
{
|
|
MentionType2Num.put(GNormPlus.PmidLF2Abb_lc_hash.get(pmid+"\t"+mention.toLowerCase())+"\t"+type,MentionType2Num.get(mention+"\t"+type)+1);
|
|
}
|
|
else
|
|
{
|
|
MentionType2Num.put(GNormPlus.PmidLF2Abb_lc_hash.get(pmid+"\t"+mention.toLowerCase())+"\t"+type,1);
|
|
}
|
|
if( GNormPlus.PmidAbb2LF_lc_hash.containsKey(pmid+"\t"+mention.toLowerCase()) )
|
|
{
|
|
MentionType2Num.put(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+mention.toLowerCase())+"\t"+type,MentionType2Num.get(mention+"\t"+type)+1);
|
|
}
|
|
else
|
|
{
|
|
MentionType2Num.put(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+mention.toLowerCase())+"\t"+type,1);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
MentionType2Num.put(mention.toLowerCase()+"\t"+type,1);
|
|
if( GNormPlus.PmidLF2Abb_lc_hash.containsKey(pmid+"\t"+mention.toLowerCase()) )
|
|
{
|
|
MentionType2Num.put(GNormPlus.PmidLF2Abb_lc_hash.get(pmid+"\t"+mention.toLowerCase())+"\t"+type,1);
|
|
}
|
|
if( GNormPlus.PmidAbb2LF_lc_hash.containsKey(pmid+"\t"+mention.toLowerCase()) )
|
|
{
|
|
MentionType2Num.put(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+mention.toLowerCase())+"\t"+type,1);
|
|
}
|
|
}
|
|
if(Anno[3].equals("Gene"))
|
|
{
|
|
String mentmp=mention.toLowerCase();
|
|
if(mentmp.matches(".*[0-9].*") || mentmp.matches(".*(alpha|beta|gamma|theta|zeta|delta).*"))
|
|
{
|
|
if(!mentmp.matches(".*\\{(alpha|beta|gamma|theta|zeta|delta)\\}.*"))
|
|
{
|
|
mentmp=mentmp.replaceAll("([^A-Za-z0-9\\| ])", "\\\\$1");
|
|
mentmp=mentmp.replaceAll("[0-9]", "[0-9]");
|
|
mentmp=mentmp.replaceAll("(alpha|beta|gamma|theta|zeta|delta)", "(alpha\\|beta\\|gamma\\|theta\\|zeta\\|delta)");
|
|
if(!GeneMentionPattern.contains(mentmp))
|
|
{
|
|
GeneMentionPattern.add(mentmp);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++)
|
|
{
|
|
ArrayList<Integer> RemoveList = new ArrayList<Integer>();
|
|
for(int k=0;k<GNormPlus.BioCDocobj.Annotations.get(i).get(j).size();k++)
|
|
{
|
|
String Anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\\t");
|
|
String start = Anno[0];
|
|
String last = Anno[1];
|
|
String mention = Anno[2];
|
|
String type = Anno[3];
|
|
String mention_tmp=mention.toLowerCase().replaceAll("([^A-Za-z0-9@ ])", "\\\\$1");
|
|
|
|
|
|
|
|
|
|
|
|
boolean SubSt=false;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if(SubSt == false)
|
|
{
|
|
int BoundaryLen=15;
|
|
if(GNormPlus.BioCDocobj.PassageContexts.get(i).get(j).length()<Integer.parseInt(last)+15)
|
|
{
|
|
BoundaryLen=GNormPlus.BioCDocobj.PassageContexts.get(i).get(j).length()-Integer.parseInt(last);
|
|
}
|
|
String SurroundingString="";
|
|
if(BoundaryLen<=0){}
|
|
else if (GNormPlus.BioCDocobj.PassageContexts.get(i).get(j).length()<Integer.parseInt(last)+BoundaryLen)
|
|
{
|
|
BoundaryLen=GNormPlus.BioCDocobj.PassageContexts.get(i).get(j).length()-Integer.parseInt(last)-1;
|
|
SurroundingString = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j).substring(Integer.parseInt(last),Integer.parseInt(last)+BoundaryLen).toLowerCase();
|
|
}
|
|
else
|
|
{
|
|
SurroundingString = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j).substring(Integer.parseInt(last),Integer.parseInt(last)+BoundaryLen).toLowerCase();
|
|
}
|
|
|
|
|
|
if( mention.toLowerCase().matches(".*("+Cell_Suffix+")") || SurroundingString.matches("("+Cell_Suffix+")") )
|
|
{
|
|
type="Cell";
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\t"+type);
|
|
}
|
|
else if( mention.toLowerCase().matches(".*("+FamilyName_Suffix+")") || SurroundingString.matches("("+FamilyName_Suffix+")") )
|
|
{
|
|
type="FamilyName";
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\t"+type);
|
|
}
|
|
else if( mention.toLowerCase().matches(".*("+DomainMotif_Suffix+")")|| SurroundingString.matches("("+DomainMotif_Suffix+")") )
|
|
{
|
|
type="DomainMotif";
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\t"+type);
|
|
}
|
|
else if(!type.equals("Gene"))
|
|
{
|
|
|
|
double Num_FDC=0;
|
|
double Num_Gene=0;
|
|
if(MentionType2Num.containsKey(mention.toLowerCase()+"\tFamilyName"))
|
|
{
|
|
Num_FDC = Num_FDC + MentionType2Num.get(mention.toLowerCase()+"\tFamilyName");
|
|
}
|
|
if(MentionType2Num.containsKey(mention.toLowerCase()+"\tDomainMotif"))
|
|
{
|
|
Num_FDC = Num_FDC + MentionType2Num.get(mention.toLowerCase()+"\tDomainMotif");
|
|
}
|
|
if(MentionType2Num.containsKey(mention.toLowerCase()+"\tCell"))
|
|
{
|
|
Num_FDC = Num_FDC + MentionType2Num.get(mention.toLowerCase()+"\tCell");
|
|
}
|
|
if(MentionType2Num.containsKey(mention.toLowerCase()+"\tGene"))
|
|
{
|
|
Num_Gene = Num_Gene + MentionType2Num.get(mention.toLowerCase()+"\tGene");
|
|
}
|
|
if(Num_Gene/(Num_FDC+Num_Gene)>=0.5)
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\tGene");
|
|
}
|
|
|
|
|
|
for(int p=0;p<GeneMentionPattern.size();p++)
|
|
{
|
|
if(mention.toLowerCase().matches(GeneMentionPattern.get(p)))
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\tGene");
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
String lc_ment=mention.toLowerCase();
|
|
if(GNormPlus.PmidAbb2LF_lc_hash.containsKey(pmid+"\t"+lc_ment))
|
|
{
|
|
|
|
if(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+lc_ment).matches(".*("+Disease_Suffix+")"))
|
|
{
|
|
|
|
}
|
|
else if(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+lc_ment).matches(".*("+Cell_Suffix+")"))
|
|
{
|
|
|
|
}
|
|
else if(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+lc_ment).matches(".*("+FamilyName_Suffix+")") && !lc_ment.matches(".+[a-z][0-9][a-z]"))
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\tFamilyName");
|
|
}
|
|
else if(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+lc_ment).matches(".*("+DomainMotif_Suffix+")"))
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\tDomainMotif");
|
|
}
|
|
else
|
|
{
|
|
if(Mention2Type_Hash.containsKey(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+lc_ment))
|
|
&& Mention2Type_Hash.get(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+lc_ment)).equals("Gene")
|
|
&& !(type.equals("Gene"))
|
|
)
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\tGene");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++)
|
|
{
|
|
for(int k=0;k<GNormPlus.BioCDocobj.Annotations.get(i).get(j).size();k++)
|
|
{
|
|
String Anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\\t");
|
|
if(Translate2Family.contains(Anno[2].toLowerCase()))
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Anno[0]+"\t"+Anno[1]+"\t"+Anno[2]+"\tFamilyName");
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
HashMap<String,String> GeneMentions = new HashMap<String,String>();
|
|
HashMap<String,String> GeneMentionLocationGNR = new HashMap<String,String>();
|
|
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++)
|
|
{
|
|
for(int k=0;k<GNormPlus.BioCDocobj.Annotations.get(i).get(j).size();k++)
|
|
{
|
|
String Anno[] = GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\\t");
|
|
int start = Integer.parseInt(Anno[0]);
|
|
int last = Integer.parseInt(Anno[1]);
|
|
String mention = Anno[2];
|
|
String type = Anno[3];
|
|
for (int s=start ;s<=last;s++)
|
|
{
|
|
GeneMentionLocationGNR.put(j+"\t"+s,type);
|
|
}
|
|
GeneMentions.put(mention.toLowerCase(),type);
|
|
}
|
|
}
|
|
|
|
|
|
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++)
|
|
{
|
|
if(GNormPlus.BioCDocobj.PassageContexts.size()>i && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j)
|
|
{
|
|
String PassageContexts = " " + GNormPlus.BioCDocobj.PassageContexts.get(i).get(j) + " ";
|
|
String PassageContexts_tmp = PassageContexts.toLowerCase();
|
|
for(String gm : GeneMentions.keySet())
|
|
{
|
|
String type=GeneMentions.get(gm);
|
|
if(type.equals("Gene"))
|
|
{
|
|
gm = gm.replaceAll("([\\W\\-\\_])", "\\\\$1");
|
|
gm=gm.replaceAll("[0-9]", "\\[0\\-9\\]");
|
|
gm=gm.replaceAll("(alpha|beta|gamma|theta|zeta|delta)", "(alpha\\|beta\\|gamma\\|theta\\|zeta\\|delta)");
|
|
gm=gm.replaceAll("\\-[a-z]$", "\\-\\[a\\-z\\]");
|
|
Pattern ptmp = Pattern.compile("^(.*[\\W\\-\\_])("+gm+")([\\W\\-\\_].*)$");
|
|
Matcher mtmp = ptmp.matcher(PassageContexts_tmp);
|
|
while(mtmp.find())
|
|
{
|
|
String pre = mtmp.group(1);
|
|
String gmtmp = mtmp.group(2);
|
|
String post = mtmp.group(3);
|
|
|
|
int start = pre.length()-1;
|
|
int last = start+gmtmp.length();
|
|
if(PassageContexts.length()>last)
|
|
{
|
|
String mention = PassageContexts.substring(start+1,last+1);
|
|
if(!GeneMentionLocationGNR.containsKey(j+"\t"+start) && !GeneMentionLocationGNR.containsKey(j+"\t"+last))
|
|
{
|
|
if(GNormPlus.BioCDocobj.Annotations.get(i).get(j).contains(start+"\t"+last+"\t"+mention+"\tFamilyName"))
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(start+"\t"+last+"\t"+mention+"\tFamilyName");
|
|
}
|
|
else if(GNormPlus.BioCDocobj.Annotations.get(i).get(j).contains(start+"\t"+last+"\t"+mention+"\tDomainMotif"))
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(start+"\t"+last+"\t"+mention+"\tDomainMotif");
|
|
}
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tGene");
|
|
}
|
|
gmtmp = gmtmp.replaceAll(".", "X");
|
|
PassageContexts_tmp=pre+""+gmtmp+""+post;
|
|
mtmp = ptmp.matcher(PassageContexts_tmp);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
for(int j=0;j<GNormPlus.BioCDocobj.Annotations.get(i).size();j++)
|
|
{
|
|
if(GNormPlus.BioCDocobj.PassageContexts.size()>i && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j)
|
|
{
|
|
String PassageContexts = " " + GNormPlus.BioCDocobj.PassageContexts.get(i).get(j) + " ";
|
|
String PassageContexts_tmp = PassageContexts.toLowerCase();
|
|
for(String gm : GeneMentions.keySet())
|
|
{
|
|
String type=GeneMentions.get(gm);
|
|
if(type.matches("(FamilyName|DomainMotif)"))
|
|
{
|
|
gm = gm.replaceAll("([\\W\\-\\_])", "\\\\$1");
|
|
gm=gm.replaceAll("s$", "(s\\|)");
|
|
Pattern ptmp = Pattern.compile("^(.*[\\W\\-\\_])("+gm+")([\\W\\-\\_].*)$");
|
|
Matcher mtmp = ptmp.matcher(PassageContexts_tmp);
|
|
while(mtmp.find())
|
|
{
|
|
String pre = mtmp.group(1);
|
|
String gmtmp = mtmp.group(2);
|
|
String post = mtmp.group(3);
|
|
|
|
int start = pre.length()-1;
|
|
int last = start+gmtmp.length();
|
|
if(PassageContexts.length()>last)
|
|
{
|
|
String mention = PassageContexts.substring(start+1,last+1);
|
|
if(!GeneMentionLocationGNR.containsKey(j+"\t"+start) && !GeneMentionLocationGNR.containsKey(j+"\t"+last))
|
|
{
|
|
if(!GNormPlus.BioCDocobj.Annotations.get(i).get(j).contains(start+"\t"+last+"\t"+mention+"\tGene"))
|
|
{
|
|
GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\t"+type);
|
|
}
|
|
}
|
|
gmtmp = gmtmp.replaceAll(".", "X");
|
|
PassageContexts_tmp=pre+""+gmtmp+""+post;
|
|
mtmp = ptmp.matcher(PassageContexts_tmp);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,false);
|
|
}
|
|
}
|
|
|
|
|
|
|