diff --git "a/src_Java/GNormPluslib/GNR.java" "b/src_Java/GNormPluslib/GNR.java" --- "a/src_Java/GNormPluslib/GNR.java" +++ "b/src_Java/GNormPluslib/GNR.java" @@ -1,1602 +1,1602 @@ -/** - * Project: GNormPlus - * Function: Gene Name Recognition - */ - -package GNormPluslib; - -import java.io.*; -import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import javax.xml.stream.XMLStreamException; - -import org.tartarus.snowball.SnowballStemmer; -import org.tartarus.snowball.ext.englishStemmer; - -import GNormPluslib.GNormPlus; -import GNormPluslib.BioCDoc; - -public class GNR -{ - /* - * Read BioC files - */ - public void Ab3P(String Filename,String FilenameAbb,String TrainTest) throws XMLStreamException,IOException - { - /** Abbreviation*/ - //BioC -> Abb input - String line=""; - BufferedWriter FileAbb = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FilenameAbb), "UTF-8")); - for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) - { - String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i); - String Context=""; - for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) - { - String PassageContext=GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); - if(PassageContext.matches(".*\\([^\\(\\)]+,[^\\(\\)]+\\).*")) - { - PassageContext=PassageContext.replaceAll("\\([^\\(\\)]+,[^\\(\\)]+\\)", ""); - } - if(PassageContext.contains("\\(")) - { - Context = Context+PassageContext+" "; - } - } - FileAbb.write(Pmid+"\n"+Context+"\n\n"); - } - FileAbb.close(); - //Abb - File f = new File(FilenameAbb+".out"); - BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), "UTF-8")); - Runtime runtime = Runtime.getRuntime(); - String cmd ="./Ab3P "+FilenameAbb+".Abb "+FilenameAbb+".out"; - - String OS=System.getProperty("os.name").toLowerCase(); - if(OS.contains("windows")) - { - cmd ="java -jar bioadi.jar "+FilenameAbb; - } - else //if(OS.contains("nux")||OS.contains("nix")) - { - cmd ="./Ab3P "+FilenameAbb+" "+FilenameAbb+".out"; - //cmd ="java -jar bioadi.jar "+FilenameAbb+" > "+FilenameAbb+".out"; - } - - Process process = runtime.exec(cmd); - InputStream is = process.getInputStream(); - InputStreamReader isr = new InputStreamReader(is, "UTF-8"); - BufferedReader br = new BufferedReader(isr); - line=""; - while ( (line = br.readLine()) != null) - { - fr.write(line); - fr.newLine(); - fr.flush(); - } - is.close(); - isr.close(); - br.close(); - fr.close(); - //Abb output -> Hash - BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameAbb+".out"), "UTF-8")); - line=""; - String pmid=""; - while ((line = inputfile.readLine()) != null) - { - String patt="^ (.+)\\|(.+)\\|([0-9\\.]+)$"; - Pattern ptmp = Pattern.compile(patt); - Matcher mtmp = ptmp.matcher(line); - if(line.matches("^[0-9]+$")) - { - pmid=line; - } - if(mtmp.find()) - { - String SF = mtmp.group(1); - String LF = mtmp.group(2); - double weight= Double.parseDouble(mtmp.group(3)); - GNormPlus.Pmid2Abb_hash.put(pmid+"\t"+SF, "Abb:SF"); - GNormPlus.Pmid2Abb_hash.put(pmid+"\t"+LF, "Abb:LF"); - GNormPlus.PmidLF2Abb_lc_hash.put(pmid+"\t"+LF.toLowerCase(), SF.toLowerCase()); - GNormPlus.PmidAbb2LF_lc_hash.put(pmid+"\t"+SF.toLowerCase(), LF.toLowerCase()); - GNormPlus.PmidAbb2LF_hash.put(pmid+"\t"+SF, LF); - if(weight >= 0.9) - { - GNormPlus.PmidLF2Abb_hash.put(pmid+"\t"+LF, SF); - } - } - } - inputfile.close(); - } - - public void LoadInputFile(String Filename,String FilenameAbb,String TrainTest) throws XMLStreamException,IOException - { - /** Read BioC file */ - //if(TrainTest.equals("Train")) - //{ - GNormPlus.BioCDocobj.BioCReaderWithAnnotation(Filename); - //} - //else - //{ - // GNormPlus.BioCDocobj.BioCReader(Filename); - //} - - - /** Abbreviation*/ - //BioC -> Abb input - String line=""; - BufferedWriter FileAbb = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FilenameAbb), "UTF-8")); - for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) - { - String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i); - String Context="Text:"; - for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) - { - String PassageContext=GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); - if(PassageContext.matches(".*\\([^\\(\\)]+,[^\\(\\)]+\\).*")) - { - PassageContext=PassageContext.replaceAll("\\([^\\(\\)]+,[^\\(\\)]+\\)", ""); - } - if(PassageContext.contains("(")) - { - Context = Context+PassageContext+" "; - } - } - FileAbb.write(Pmid+"\n"+Context+"\n\n"); - } - FileAbb.close(); - //Abb - File f = new File(FilenameAbb+".out"); - BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), "UTF-8")); - Runtime runtime = Runtime.getRuntime(); - String cmd ="./Ab3P "+FilenameAbb+".Abb "+FilenameAbb+".out"; - - String OS=System.getProperty("os.name").toLowerCase(); - if(OS.contains("windows")) - { - cmd ="java -jar bioadi.jar "+FilenameAbb; - } - else //if(OS.contains("nux")||OS.contains("nix")) - { - cmd ="./Ab3P "+FilenameAbb+" "+FilenameAbb+".out"; - //cmd ="java -jar bioadi.jar "+FilenameAbb+" > "+FilenameAbb+".out"; - } - - Process process = runtime.exec(cmd); - InputStream is = process.getInputStream(); - InputStreamReader isr = new InputStreamReader(is, "UTF-8"); - BufferedReader br = new BufferedReader(isr); - line=""; - while ( (line = br.readLine()) != null) - { - fr.write(line); - fr.newLine(); - fr.flush(); - } - is.close(); - isr.close(); - br.close(); - fr.close(); - //Abb output -> Hash - BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameAbb+".out"), "UTF-8")); - line=""; - String pmid=""; - while ((line = inputfile.readLine()) != null) - { - String patt="^ (.+)\\|(.+)\\|([0-9\\.]+)$"; - Pattern ptmp = Pattern.compile(patt); - Matcher mtmp = ptmp.matcher(line); - if(line.matches("^[0-9]+$")) - { - pmid=line; - } - if(mtmp.find()) - { - String SF = mtmp.group(1); - String LF = mtmp.group(2); - double weight= Double.parseDouble(mtmp.group(3)); - GNormPlus.Pmid2Abb_hash.put(pmid+"\t"+SF, "Abb:SF"); - GNormPlus.Pmid2Abb_hash.put(pmid+"\t"+LF, "Abb:LF"); - GNormPlus.PmidLF2Abb_lc_hash.put(pmid+"\t"+LF.toLowerCase(), SF.toLowerCase()); - GNormPlus.PmidAbb2LF_lc_hash.put(pmid+"\t"+SF.toLowerCase(), LF.toLowerCase()); - GNormPlus.PmidAbb2LF_hash.put(pmid+"\t"+SF, LF); - if(weight >= 0.9) - { - GNormPlus.PmidLF2Abb_hash.put(pmid+"\t"+LF, SF); - } - } - } - inputfile.close(); - } - - /* - * Feature Extraction - */ - public void FeatureExtraction(String FilenameData,String FilenameLoca,String TrainTest) throws XMLStreamException - { - try - { - /** output files */ - BufferedWriter FileLocation = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FilenameLoca), "UTF-8")); // .location - BufferedWriter FileData = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FilenameData), "UTF-8")); // .data - //NLP modules - SnowballStemmer stemmer = new englishStemmer(); - /** PMIDs : i */ - for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) - { - String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i); - - /** Paragraphs : j */ - for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) - { - String PassageName= GNormPlus.BioCDocobj.PassageNames.get(i).get(j); // Passage name - int PassageOffset = GNormPlus.BioCDocobj.PassageOffsets.get(i).get(j); // Passage offset - String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context - ArrayList Annotation = GNormPlus.BioCDocobj.Annotations.get(i).get(j); // Annotation - HashMap CTDGene_hash = new HashMap(); - HashMap FamilyName_hash = new HashMap(); - HashMap character_hash = new HashMap(); - HashMap Abbreviation_hash = new HashMap(); - String PassageContext_tmp=" "+PassageContext+" "; - - /** Abbreviation */ - HashMap Abb_sortebylength = new HashMap(); - ArrayList length_list = new ArrayList(); - int countn=0; - for (Object key : GNormPlus.Pmid2Abb_hash.keySet()) - { - String pmid2abb[]=key.toString().split("\t"); - if(Pmid.equals(pmid2abb[0])) - { - Abb_sortebylength.put(pmid2abb[1].length()*100+countn, pmid2abb[1]); - length_list.add(pmid2abb[1].length()*100+countn); - countn++; - } - } - Collections.sort(length_list); - for (int l=length_list.size()-1;l>=0;l--) - { - String AbbLF = Abb_sortebylength.get(length_list.get(l)); - AbbLF=AbbLF.replaceAll("([^A-Za-z0-9@ ])", "\\\\$1"); - AbbLF=AbbLF.replaceAll(" ", "\\[ \\]\\+"); - Pattern ptmp = Pattern.compile("^(.*[^A-Za-z0-9]+)("+AbbLF+")([^A-Za-z0-9]+.*)$"); - Matcher mtmp = ptmp.matcher(PassageContext_tmp); - while(mtmp.find()) - { - String str1=mtmp.group(1); - String str2=mtmp.group(2); - String str3=mtmp.group(3); - for(int m=str1.length();m<=(str1.length()+str2.length());m++) - { - Abbreviation_hash.put((m-1),GNormPlus.Pmid2Abb_hash.get(Pmid+"\t"+Abb_sortebylength.get(length_list.get(l)))); - } - String men=""; - for(int m=0;m locations = GNormPlus.PT_CTDGene.SearchMentionLocation(PassageContext,"CTDGene"); - for (int k = 0 ; k < locations.size() ; k++) - { - String anno[]=locations.get(k).split("\t"); - int start= Integer.parseInt(anno[0]) + PassageOffset; - int last= Integer.parseInt(anno[1]) + PassageOffset; - String mention = anno[2]; - String id = anno[3]; - - CTDGene_hash.put(start,"CTDGene_B"); - CTDGene_hash.put(last,"CTDGene_E"); - for(int s=start+1;s locations_Fname = GNormPlus.PT_FamilyName.SearchMentionLocation(PassageContext,"FamilyName"); - for (int k = 0 ; k < locations_Fname.size() ; k++) - { - String anno[]=locations_Fname.get(k).split("\t"); - int start= Integer.parseInt(anno[0]) + PassageOffset; - int last= Integer.parseInt(anno[1]) + PassageOffset; - String mention = anno[2]; - String id = anno[3]; - - if(!CTDGene_hash.containsKey(start)) - { - FamilyName_hash.put(start,"famplex_B"); - FamilyName_hash.put(last,"famplex_E"); - for(int s=start+1;stokens[p].length() && PassageContext_tmp.substring(tokens[p].length(),tokens[p].length()+1).equals(" ")) - { - WSF="WSF:Gap"; - } - if(p==0) - { - WSB="WSB:1st"; - } - else if(p==tokens.length-1) - { - WSF="WSF:last"; - } - - if(PassageContext_tmp.substring(0,tokens[p].length()).equals(tokens[p])) - { - if(tokens[p].length()>0) - { - /* - * .loca - */ - int start=Offset; - int last=Offset+tokens[p].length(); - String State=""; - if(!character_hash.containsKey(start) || !character_hash.containsKey(last)){} - else if(character_hash.get(start).matches(".*B$")) - { - State=character_hash.get(start); - } - else if(character_hash.get(last).matches(".*E$")) - { - State=character_hash.get(last); - } - else if(character_hash.get(start).matches(".*I$")) - { - State=character_hash.get(start); - } - - if((!tokens[p].equals("\t"))) - { - FileLocation.write(Pmid+"\t"+PassageName+"\t"+j+"\t"+tokens[p]+"\t"+(Offset+1)+"\t"+(Offset+tokens[p].length())+"\t"+State+"\n"); - } - - /* - * .data - */ - - //Abbreviation - String Abb_State="__nil__"; - if(!Abbreviation_hash.containsKey(start) || !Abbreviation_hash.containsKey(last)){Abb_State="__nil__";} - else if(Abbreviation_hash.containsKey(start)) - { - Abb_State=Abbreviation_hash.get(start); - } - - //CTDGene - start=PassageOffset+Offset; - last=PassageOffset+Offset+tokens[p].length(); - String CTDGene_State="__nil__"; - if(!CTDGene_hash.containsKey(start) || !CTDGene_hash.containsKey(last)){CTDGene_State="__nil__";} - else if(CTDGene_hash.get(start).matches(".*B$")) - { - CTDGene_State=CTDGene_hash.get(start); - } - else if(CTDGene_hash.get(last).matches(".*E$")) - { - CTDGene_State=CTDGene_hash.get(last); - } - else if(CTDGene_hash.get(start).matches(".*I$")) - { - CTDGene_State=CTDGene_hash.get(start); - } - - //FamilyName - if(CTDGene_State.equals("__nil__")) - { - start=PassageOffset+Offset; - last=PassageOffset+Offset+tokens[p].length(); - if(!FamilyName_hash.containsKey(start) || !FamilyName_hash.containsKey(last)){} - else if(FamilyName_hash.get(start).matches(".*B$")) - { - CTDGene_State=FamilyName_hash.get(start); - } - else if(FamilyName_hash.get(last).matches(".*E$")) - { - CTDGene_State=FamilyName_hash.get(last); - } - else if(FamilyName_hash.get(start).matches(".*I$")) - { - CTDGene_State=FamilyName_hash.get(start); - } - } - - //stemming - stemmer.setCurrent(tokens[p].toLowerCase()); - stemmer.stem(); - String stem=stemmer.getCurrent(); - - //Number of Numbers [0-9] - String Num_num=""; - String tmp=tokens[p]; - tmp=tmp.replaceAll("[^0-9]",""); - if(tmp.length()>3){Num_num="N:4+";}else{Num_num="N:"+ tmp.length();} - - //Number of Uppercase [A-Z] - String Num_Uc=""; - tmp=tokens[p]; - tmp=tmp.replaceAll("[^A-Z]",""); - if(tmp.length()>3){Num_Uc="U:4+";}else{Num_Uc="U:"+ tmp.length();} - - //Number of Lowercase [a-z] - String Num_lc=""; - tmp=tokens[p]; - tmp=tmp.replaceAll("[^a-z]",""); - if(tmp.length()>3){Num_lc="L:4+";}else{Num_lc="L:"+ tmp.length();} - - //Number of ALL char - String Num_All=""; - if(tokens[p].length()>3){Num_All="A:4+";}else{Num_All="A:"+ tokens[p].length();} - - //specific character (;:,.->+_) - String SpecificC="__nil__"; - if(tokens[p].equals(";") || tokens[p].equals(":") || tokens[p].equals(",") || tokens[p].equals(".") || tokens[p].equals("-") || tokens[p].equals(">") || tokens[p].equals("+") || tokens[p].equals("_")) - { - SpecificC="-SpecificC1-"; - } - else if(tokens[p].equals("(") || tokens[p].equals(")")) - { - SpecificC="-SpecificC2-"; - } - else if(tokens[p].equals("{") || tokens[p].equals("}")) - { - SpecificC="-SpecificC3-"; - } - else if(tokens[p].equals("[") || tokens[p].equals("]")) - { - SpecificC="-SpecificC4-"; - } - else if(tokens[p].equals("\\") || tokens[p].equals("/")) - { - SpecificC="-SpecificC5-"; - } - - //Chemical Prefix/Suffix - String ChemPreSuf="__nil__"; - if(tokens[p].matches(".*(yl|ylidyne|oyl|sulfonyl)")){ChemPreSuf="-CHEMinlineSuffix-";} - else if(tokens[p].matches("(meth|eth|prop|tetracos).*")){ChemPreSuf="-CHEMalkaneStem-";} - else if(tokens[p].matches("(di|tri|tetra).*")){ChemPreSuf="-CHEMsimpleMultiplier-";} - else if(tokens[p].matches("(benzen|pyridin|toluen).*")){ChemPreSuf="-CHEMtrivialRing-";} - else if(tokens[p].matches(".*(one|ol|carboxylic|amide|ate|acid|ium|ylium|ide|uide|iran|olan|inan|pyrid|acrid|amid|keten|formazan|fydrazin)(s|)")){ChemPreSuf="-CHEMsuffix-";} - - - //Mention Type - String MentionType="__nil__"; - /* - if($tmp eq "to" && $CTD_result_hash{$count_token-1} eq "CTD_gene" && $CTD_result_hash{$count_token+1} eq "CTD_gene"){$CTD_result_hash{$count_token}="CTD_gene";} - if($tmp=~/^(or|and|,)$/ && $CTD_result_hash{$count_token-1} eq "CTD_gene" && $CTD_result_hash{$count_token+1} eq "CTD_gene"){$MentionType="-Type_GeneConjunction-";} - elsif($tmp=~/^(or|and|,)$/ && $last_token=~/^(or|and|,)$/ && $CTD_result_hash{$count_token-2} eq "CTD_gene" && $CTD_result_hash{$count_token+1} eq "CTD_gene"){$MentionType="-Type_GeneConjunction-";} - elsif($tmp=~/^(or|and|,)$/ && $next_token=~/^(or|and|,)$/ && $CTD_result_hash{$count_token-1} eq "CTD_gene" && $CTD_result_hash{$count_token+2} eq "CTD_gene"){$MentionType="-Type_GeneConjunction-";} - */ - if(tokens[p].matches("(ytochrome|cytochrome)")){MentionType="-Type_cytochrome-";} - else if(tokens[p].matches(".*target") ){MentionType="-Type_target-";} - else if(tokens[p].matches(".*(irradiation|hybrid|fusion|experiment|gst|est|gap|antigen)") ){MentionType="-Type_ExperimentNoun-";} - else if(tokens[p].matches(".*(disease|disorder|dystrophy|deficiency|syndrome|dysgenesis|cancer|injury|neoplasm|diabetes|diabete)") ){MentionType="-Type_Disease-";} - else if(tokens[p].matches(".*(motif|domain|omain|binding|site|region|sequence|frameshift|finger|box).*") ){MentionType="-Type_DomainMotif-";} - else if(tokens[p].equals("-") && (p0 && tokens[p-1].matches("^[0-9]+$")) ) ){MentionType="-Type_ChromosomeStrain-";} - else if(tokens[p].matches(".*(related|regulated|associated|correlated|reactive).*")){MentionType="-Type_relation-";} - else if(tokens[p].toLowerCase().matches(".*(polymorphism|mutation|deletion|insertion|duplication|genotype|genotypes).*") ){MentionType="-Type_VariationTerms-";} - else if(tokens[p].matches(".*(oxidase|transferase|transferases|kinase|kinese|subunit|unit|receptor|adrenoceptor|transporter|regulator|transcription|antigen|protein|gene|factor|member|molecule|channel|deaminase|spectrin).*") ){MentionType="-Type_suffix-";} - else if(tokens[p].matches("[\\(\\-\\_]") && (p=1){ prefix=tmp.substring(0, 1);}else{prefix="__nil__";} - if(tmp.length()>=2){ prefix=prefix+" "+tmp.substring(0, 2);}else{prefix=prefix+" __nil__";} - if(tmp.length()>=3){ prefix=prefix+" "+tmp.substring(0, 3);}else{prefix=prefix+" __nil__";} - if(tmp.length()>=4){ prefix=prefix+" "+tmp.substring(0, 4);}else{prefix=prefix+" __nil__";} - if(tmp.length()>=5){ prefix=prefix+" "+tmp.substring(0, 5);}else{prefix=prefix+" __nil__";} - - - //suffix - String suffix=""; - tmp=tokens[p]; - if(tmp.length()>=1){ suffix=tmp.substring(tmp.length()-1, tmp.length());}else{suffix="__nil__";} - if(tmp.length()>=2){ suffix=suffix+" "+tmp.substring(tmp.length()-2, tmp.length());}else{suffix=suffix+" __nil__";} - if(tmp.length()>=3){ suffix=suffix+" "+tmp.substring(tmp.length()-3, tmp.length());}else{suffix=suffix+" __nil__";} - if(tmp.length()>=4){ suffix=suffix+" "+tmp.substring(tmp.length()-4, tmp.length());}else{suffix=suffix+" __nil__";} - if(tmp.length()>=5){ suffix=suffix+" "+tmp.substring(tmp.length()-5, tmp.length());}else{suffix=suffix+" __nil__";} - - if(State.equals("")) - { - State="O"; - } - - if((!tokens[p].equals("\t"))) - { - if(TrainTest.equals("Train")) - { - FileData.write(tokens[p]+" "+stem+" "+WSB+" "+WSF+" "+Num_num+" "+Num_Uc+" "+Num_lc+" "+Num_All+" "+SpecificC+" "+ChemPreSuf+" "+MentionType+" "+ProteinSym+" "+prefix+" "+suffix+" "+CTDGene_State+" "+Abb_State+" "+State+"\n"); - } - else - { - FileData.write(tokens[p]+" "+stem+" "+WSB+" "+WSF+" "+Num_num+" "+Num_Uc+" "+Num_lc+" "+Num_All+" "+SpecificC+" "+ChemPreSuf+" "+MentionType+" "+ProteinSym+" "+prefix+" "+suffix+" "+CTDGene_State+" "+Abb_State+"\n"); - } - } - PassageContext_tmp=PassageContext_tmp.substring(tokens[p].length()); // remove the token for the context - Offset=Offset+tokens[p].length(); - } - } - } - if(tokens.length>0) - { - FileLocation.write("\n"); - FileData.write("\n"); - } - } - } - FileLocation.close(); - FileData.close(); - } - catch(IOException e1){ System.out.println("[MR]: Input file is not exist.");} - } - /* - * Testing by CRF++ - */ - public void CRF_test(String model, String FilenameData, String FilenameOutput) throws IOException - { - File f = new File(FilenameOutput); - BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), "UTF-8")); - - Runtime runtime = Runtime.getRuntime(); - - String OS=System.getProperty("os.name").toLowerCase(); - - String cmd="./CRF/crf_test -m "+model+" -o "+FilenameOutput+" "+FilenameData; - if(OS.contains("windows")) - { - cmd ="CRF/crf_test -m "+model+" -o "+FilenameOutput+" "+FilenameData; - } - else //if(OS.contains("nux")||OS.contains("nix")) - { - cmd ="./CRF/crf_test -m "+model+" -o "+FilenameOutput+" "+FilenameData; - } - - try { - Process process = runtime.exec(cmd); - InputStream is = process.getInputStream(); - InputStreamReader isr = new InputStreamReader(is, "UTF-8"); - BufferedReader br = new BufferedReader(isr); - String line=""; - while ( (line = br.readLine()) != null) - { - fr.write(line); - fr.newLine(); - fr.flush(); - } - is.close(); - isr.close(); - br.close(); - fr.close(); - } - catch (IOException e) { - System.out.println(e); - runtime.exit(0); - } - } - - public void CRF_test(String model,String FilenameData,String FilenameOutput,String top3) throws IOException - { - File f = new File(FilenameOutput); - BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), "UTF-8")); - - Runtime runtime = Runtime.getRuntime(); - - String OS=System.getProperty("os.name").toLowerCase(); - - String cmd="./CRF/crf_test -n 3 -m "+model+" -o "+FilenameOutput+" "+FilenameData; - if(OS.contains("windows")) - { - cmd ="CRF/crf_test -n 3 -m "+model+" -o "+FilenameOutput+" "+FilenameData; - } - else //if(OS.contains("nux")||OS.contains("nix")) - { - cmd ="./CRF/crf_test -n 3 -m "+model+" -o "+FilenameOutput+" "+FilenameData; - } - - try { - Process process = runtime.exec(cmd); - InputStream is = process.getInputStream(); - InputStreamReader isr = new InputStreamReader(is, "UTF-8"); - BufferedReader br = new BufferedReader(isr); - String line=""; - while ( (line = br.readLine()) != null) - { - fr.write(line); - fr.newLine(); - fr.flush(); - } - is.close(); - isr.close(); - br.close(); - fr.close(); - } - catch (IOException e) { - System.out.println(e); - runtime.exit(0); - } - } - - /* - * Learning model by CRF++ - */ - public void CRF_learn(String model, String FilenameData) throws IOException - { - Runtime runtime = Runtime.getRuntime(); - - Process process = null; - String line = null; - InputStream is = null; - InputStreamReader isr = null; - BufferedReader br = null; - - String OS=System.getProperty("os.name").toLowerCase(); - - String cmd="./CRF/crf_learn -f 3 -c 4.0 CRF/template_UB "+FilenameData+" "+model; - if(OS.contains("windows")) - { - cmd ="CRF/crf_learn -f 3 -c 4.0 CRF/template_UB "+FilenameData+" "+model; - } - else //if(OS.contains("nux")||OS.contains("nix")) - { - cmd ="./CRF/crf_learn -f 3 -c 4.0 CRF/template_UB "+FilenameData+" "+model; - } - - try { - process = runtime.exec(cmd); - is = process.getInputStream(); - isr = new InputStreamReader(is, "UTF-8"); - br = new BufferedReader(isr); - while ( (line = br.readLine()) != null) - { - System.out.println(line); - System.out.flush(); - } - is.close(); - isr.close(); - br.close(); - } - catch (IOException e) { - System.out.println(e); - runtime.exit(0); - } - } - - public void ReadCRFresult(String Filename,String FilenameLoca,String FilenameOutput,String FilenameBioC) throws XMLStreamException, IOException - { - /** load CRF output */ - ArrayList outputArr = new ArrayList(); - BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameOutput), "UTF-8")); - String line; - while ((line = inputfile.readLine()) != null) - { - outputArr.add(line); - } - inputfile.close(); - - /** load location */ - ArrayList locationArr = new ArrayList(); - inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameLoca), "UTF-8")); - while ((line = inputfile.readLine()) != null) - { - locationArr.add(line); - } - inputfile.close(); - - /** output -> mentions */ - String pmid_last=""; - String paragraph_num_last=""; - String pmid=""; - String paragraph=""; - String paragraph_num=""; - Pattern pat_B = Pattern.compile("((FamilyName|DomainMotif|Gene)_[B])$"); - Pattern pat_IE = Pattern.compile("((FamilyName|DomainMotif|Gene)_[IE])$"); - ArrayList> AnnotationInPMID = new ArrayList(); // array of Annotations in the PMIDs - ArrayList AnnotationInPassage= new ArrayList(); // array of Annotations in the Passage - GNormPlus.BioCDocobj.Annotations = new ArrayList(); - int countPMID=0; - int countPassage=0; - /** outputArr */ - for(int i=0;i3) - { - pmid=locationRow[0]; - paragraph=locationRow[1]; - paragraph_num=locationRow[2]; - } - - if( (!paragraph_num_last.equals("")) && (!paragraph_num.equals(paragraph_num_last)) ) - { - AnnotationInPMID.add(AnnotationInPassage); - AnnotationInPassage = new ArrayList(); - countPassage++; - } - if( (!pmid_last.equals("")) && (!pmid.equals(pmid_last)) ) - { - GNormPlus.BioCDocobj.Annotations.add(AnnotationInPMID); - AnnotationInPMID = new ArrayList(); - countPMID++; - countPassage=0; - } - - boolean F = false; //Flag of Finding - if(locationRow.length>2) - { - Matcher mat = pat_B.matcher(outputsRow[outputsRow.length-1]); // last column : Status - while(mat.find() && locationRow.length==6) - { - MentionType=mat.group(2); - pmid=locationRow[0]; - paragraph_num=locationRow[2]; - int start_tmp=Integer.parseInt(locationRow[4])-1; - int last_tmp=Integer.parseInt(locationRow[5]); - if(start_tmplast){last=last_tmp;} - i++; - F = true; - if(locationArr.get(i).length()>0) - { - outputsRow=outputArr.get(i).split("\\t"); - locationRow=locationArr.get(i).split("\\t"); - mat = pat_IE.matcher(outputsRow[outputsRow.length-1]); - } - else - { - break; - } - } - } - - if(F == true) - { - String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage); // Passage context - String Mention = PassageContext.substring(start, last); - String Mention_nospace = Mention.replaceAll("[\\W\\-\\_]", ""); - if(Mention.toLowerCase().matches("(figure|tables|fig|tab|exp\\. [0-9]+).*")){} - else if(Mention.matches("[A-Z][A-Z]s")){} - else if(Mention.matches(".*\\|.*")){} - else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\;\\,\\'\\/\\\\].*")){} - else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\(].*") && !Mention.matches(".*[\\)].*")){} - else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\[].*") && !Mention.matches(".*[\\]].*")){} - else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\)].*") && !Mention.matches(".*[\\(].*")){} - else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\]].*") && !Mention.matches(".*[\\[].*")){} - else - { - AnnotationInPassage.add(start+"\t"+last+"\t"+Mention+"\t"+MentionType); - } - i--; - } - - paragraph_num_last=paragraph_num; - pmid_last=pmid; - }// outputArr1 - AnnotationInPMID.add(AnnotationInPassage); - GNormPlus.BioCDocobj.Annotations.add(AnnotationInPMID); - - //GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,false); //save in BioC file - } - - public void ReadCRFresult(String Filename,String FilenameLoca,String FilenameOutput,String FilenameBioC,double threshold,double threshold_GeneType) throws XMLStreamException, IOException - { - /** load CRF output */ - ArrayList outputArr1 = new ArrayList(); - ArrayList outputArr2 = new ArrayList(); - ArrayList outputArr3 = new ArrayList(); - ArrayList outputArr1_score = new ArrayList(); - ArrayList outputArr2_score = new ArrayList(); - ArrayList outputArr3_score = new ArrayList(); - BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameOutput), "UTF-8")); - String line; - int rank=0; - String score=""; - Pattern pat_Rank = Pattern.compile("^# ([0-2]) ([0-9\\.]+)$"); - while ((line = inputfile.readLine()) != null) - { - Matcher mat = pat_Rank.matcher(line); // last column : Status - if(mat.find()) - { - rank = Integer.parseInt(mat.group(1)); - score = mat.group(2); - } - else if(rank == 0) - { - outputArr1.add(line); - outputArr1_score.add(score); - } - else if(rank == 1) - { - outputArr2.add(line); - outputArr2_score.add(score); - } - else if(rank == 2) - { - outputArr3.add(line); - outputArr3_score.add(score); - } - } - inputfile.close(); - - /** load location */ - ArrayList locationArr = new ArrayList(); - inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameLoca), "UTF-8")); - while ((line = inputfile.readLine()) != null) - { - locationArr.add(line); - } - inputfile.close(); - - /** output -> mentions */ - String pmid_last=""; - String paragraph_num_last=""; - String pmid=""; - String paragraph=""; - String paragraph_num=""; - Pattern pat_B = Pattern.compile("((FamilyName|DomainMotif|Gene)_[B])$"); - Pattern pat_IE = Pattern.compile("((FamilyName|DomainMotif|Gene)_[IE])$"); - ArrayList> AnnotationInPMID = new ArrayList(); // array of Annotations in the PMIDs - ArrayList AnnotationInPassage= new ArrayList(); // array of Annotations in the Passage - GNormPlus.BioCDocobj.Annotations = new ArrayList(); - int countPMID=0; - int countPassage=0; - /** outputArr1 */ - int size_Arr=outputArr1.size(); - if(locationArr.size()3) - { - pmid=locationRow[0]; - paragraph=locationRow[1]; - paragraph_num=locationRow[2]; - } - - boolean F = false; //Flag of Finding - if(outputsRow.length>=1) - { - Matcher mat = pat_B.matcher(outputsRow[outputsRow.length-1]); // last column : Status - while(mat.find() && locationRow.length==6) - { - MentionType=mat.group(2); - pmid=locationRow[0]; - int start_tmp=Integer.parseInt(locationRow[4])-1; - int last_tmp=Integer.parseInt(locationRow[5]); - if(start_tmplast){last=last_tmp;} - i++; - outputsRow=outputArr1.get(i).split("\\t"); - locationRow=locationArr.get(i).split("\\t"); - mat = pat_IE.matcher(outputsRow[outputsRow.length-1]); - F = true; - } - } - - if( (!paragraph_num_last.equals("")) && (!paragraph_num.equals(paragraph_num_last)) ) // paragraph change - { - AnnotationInPMID.add(AnnotationInPassage); - AnnotationInPassage = new ArrayList(); - countPassage++; - } - - if( !pmid.equals(pmid_last) && paragraph_num.equals("0") && paragraph_num_last.equals("0") ) // pmid change (special case : the article only has one paragrpah) - { - AnnotationInPMID.add(AnnotationInPassage); - AnnotationInPassage = new ArrayList(); - GNormPlus.BioCDocobj.Annotations.add(AnnotationInPMID); - AnnotationInPMID = new ArrayList(); - countPMID++; - countPassage=0; - } - else if( (!pmid_last.equals("")) && (!pmid.equals(pmid_last)) ) // pmid change - { - GNormPlus.BioCDocobj.Annotations.add(AnnotationInPMID); - AnnotationInPMID = new ArrayList(); - countPMID++; - countPassage=0; - } - - if(F == true) - { - if(GNormPlus.BioCDocobj.PassageContexts.size()>countPMID && GNormPlus.BioCDocobj.PassageContexts.get(countPMID).size()>countPassage && GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage).length()>=last && (last-start)<1000) - { - String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage); // Passage context - String Mention = PassageContext.substring(start, last); - String Mention_nospace = Mention.replaceAll("[\\W\\-\\_]", ""); - if(Mention.toLowerCase().matches("(figure|tables|fig|tab|exp\\. [0-9]+).*")){} - else if(Mention.matches("[A-Z][A-Z]s")){} - else if(Mention.matches(".*\\|.*")){} - else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\;\\,\\'\\/\\\\].*")){} - else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\(].*") && !Mention.matches(".*[\\)].*")){} - else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\[].*") && !Mention.matches(".*[\\]].*")){} - else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\)].*") && !Mention.matches(".*[\\(].*")){} - else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\]].*") && !Mention.matches(".*[\\[].*")){} - else if((GNormPlus.Abb2Longformtok_hash.containsKey(Mention_nospace.toLowerCase())) && (PassageContext.toLowerCase().matches(".*[\\W\\-\\-]("+GNormPlus.Abb2Longformtok_hash.get(Mention_nospace.toLowerCase())+")[\\W\\-\\-].*"))) - { - //System.out.println(Mention_nospace.toLowerCase()+"\t"+GNormPlus.Abb2Longformtok_hash.get(Mention_nospace.toLowerCase())); - } - else - { - AnnotationInPassage.add(start+"\t"+last+"\t"+Mention+"\t"+MentionType); - } - } - i--; - } - paragraph_num_last=paragraph_num; - pmid_last=pmid; - }// outputArr1 - AnnotationInPMID.add(AnnotationInPassage); - GNormPlus.BioCDocobj.Annotations.add(AnnotationInPMID); - - /** outputArr2 */ - pmid_last=""; - paragraph_num_last=""; - pmid=""; - paragraph=""; - paragraph_num=""; - countPMID=0; - countPassage=0; - size_Arr=outputArr2.size(); - if(locationArr.size()2) - { - pmid=locationRow[0]; - paragraph=locationRow[1]; - paragraph_num=locationRow[2]; - } - - boolean F = false; //Flag of Finding - if(outputsRow.length>=1) - { - Matcher mat = pat_B.matcher(outputsRow[outputsRow.length-1]); // last column : Status - while(mat.find() && locationRow.length==6) - { - MentionType=mat.group(2); - pmid=locationRow[0]; - int start_tmp=Integer.parseInt(locationRow[4])-1; - int last_tmp=Integer.parseInt(locationRow[5]); - if(start_tmplast){last=last_tmp;} - i++; - outputsRow=outputArr2.get(i).split("\\t"); - locationRow=locationArr.get(i).split("\\t"); - mat = pat_IE.matcher(outputsRow[outputsRow.length-1]); - F = true; - } - } - - if( (!paragraph_num_last.equals("")) && (!paragraph_num.equals(paragraph_num_last)) ) // paragraph change - { - countPassage++; - } - - if( !pmid.equals(pmid_last) && paragraph_num.equals("0") && paragraph_num_last.equals("0") ) // pmid change (special case : the article only has one paragrpah) - { - countPMID++; - countPassage=0; - } - else if( (!pmid_last.equals("")) && (!pmid.equals(pmid_last)) ) // pmid change - { - countPMID++; - countPassage=0; - } - - if(F == true) - { - if(GNormPlus.BioCDocobj.PassageContexts.size()>countPMID && GNormPlus.BioCDocobj.PassageContexts.get(countPMID).size()>countPassage && GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage).length()>=last && (last-start)<1000) - { - String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage); // Passage context - String Mention = PassageContext.substring(start, last); - String Mention_nospace = Mention.replaceAll("[\\W\\-\\_]", ""); - if(Mention.toLowerCase().matches("(figure|tables|fig|tab|exp\\. [0-9]+).*")){} - else if(Mention.matches("[A-Z][A-Z]s")){} - else if(Mention.matches(".*\\|.*")){} - else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\;\\,\\'\\/\\\\].*")){} - else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\(].*") && !Mention.matches(".*[\\)].*")){} - else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\[].*") && !Mention.matches(".*[\\]].*")){} - else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\)].*") && !Mention.matches(".*[\\(].*")){} - else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\]].*") && !Mention.matches(".*[\\[].*")){} - else if((GNormPlus.Abb2Longformtok_hash.containsKey(Mention_nospace.toLowerCase())) && (PassageContext.toLowerCase().matches(".*[\\W\\-\\-]("+GNormPlus.Abb2Longformtok_hash.get(Mention_nospace.toLowerCase())+")[\\W\\-\\-].*"))) - { - //System.out.println(Mention_nospace.toLowerCase()+"\t"+GNormPlus.Abb2Longformtok_hash.get(Mention_nospace.toLowerCase())); - } - else if(Double.parseDouble(outputArr2_score.get(i))>threshold) - { - boolean overlap=false; - for(int j=0;jthreshold_GeneType && GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).get(j).matches(start+"\t"+last+"\t"+Mention_tmp+"\t(FamilyName|DomainMotif)") ) - { - GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).set(j, start+"\t"+last+"\t"+Mention+"\t"+MentionType); - } - else if( (start>=startj && startstartj && last<=lastj) ) - { - overlap=true; - } - } - if(overlap == false) - { - GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).add(start+"\t"+last+"\t"+Mention+"\t"+MentionType); - } - } - } - i--; - } - - paragraph_num_last=paragraph_num; - pmid_last=pmid; - }// outputArr2 - - /** outputArr3 */ - pmid_last=""; - paragraph_num_last=""; - pmid=""; - paragraph=""; - paragraph_num=""; - countPMID=0; - countPassage=0; - size_Arr=outputArr3.size(); - if(locationArr.size()2) - { - pmid=locationRow[0]; - paragraph=locationRow[1]; - paragraph_num=locationRow[2]; - } - - boolean F = false; //Flag of Finding - if(outputsRow.length>=1) - { - Matcher mat = pat_B.matcher(outputsRow[outputsRow.length-1]); // last column : Status - while(mat.find() && locationRow.length==6) - { - MentionType=mat.group(2); - pmid=locationRow[0]; - paragraph_num=locationRow[2]; - int start_tmp=Integer.parseInt(locationRow[4])-1; - int last_tmp=Integer.parseInt(locationRow[5]); - if(start_tmplast){last=last_tmp;} - i++; - outputsRow=outputArr3.get(i).split("\\t"); - locationRow=locationArr.get(i).split("\\t"); - mat = pat_IE.matcher(outputsRow[outputsRow.length-1]); - F = true; - } - } - - if( (!paragraph_num_last.equals("")) && (!paragraph_num.equals(paragraph_num_last)) ) // paragraph change - { - countPassage++; - } - - if( !pmid.equals(pmid_last) && paragraph_num.equals("0") && paragraph_num_last.equals("0") ) // pmid change (special case : the article only has one paragrpah) - { - countPMID++; - countPassage=0; - } - else if( (!pmid_last.equals("")) && (!pmid.equals(pmid_last)) ) // pmid change - { - countPMID++; - countPassage=0; - } - - if(F == true) - { - if(GNormPlus.BioCDocobj.PassageContexts.size()>countPMID && GNormPlus.BioCDocobj.PassageContexts.get(countPMID).size()>countPassage && GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage).length()>=last && (last-start)<1000) - { - String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage); // Passage context - String Mention = PassageContext.substring(start, last); - String Mention_nospace = Mention.replaceAll("[\\W\\-\\_]", ""); - if(Mention.toLowerCase().matches("(figure|tables|fig|tab|exp\\. [0-9]+).*")){} - else if(Mention.matches("[A-Z][A-Z]s")){} - else if(Mention.matches(".*\\|.*")){} - else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\;\\,\\'\\/\\\\].*")){} - else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\(].*") && !Mention.matches(".*[\\)].*")){} - else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\[].*") && !Mention.matches(".*[\\]].*")){} - else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\)].*") && !Mention.matches(".*[\\(].*")){} - else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\]].*") && !Mention.matches(".*[\\[].*")){} - else if((GNormPlus.Abb2Longformtok_hash.containsKey(Mention_nospace.toLowerCase())) && (PassageContext.toLowerCase().matches(".*[\\W\\-\\-]("+GNormPlus.Abb2Longformtok_hash.get(Mention_nospace.toLowerCase())+")[\\W\\-\\-].*"))) - { - //System.out.println(Mention_nospace.toLowerCase()+"\t"+GNormPlus.Abb2Longformtok_hash.get(Mention_nospace.toLowerCase())); - } - else if(Double.parseDouble(outputArr3_score.get(i))>threshold) - { - boolean overlap=false; - for(int j=0;jthreshold_GeneType && GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).get(j).matches(start+"\t"+last+"\t"+Mention_tmp+"\t(FamilyName|DomainMotif)") ) - { - GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).set(j, start+"\t"+last+"\t"+Mention+"\t"+MentionType); - } - else if( (start>=startj && startstartj && last<=lastj) ) - { - overlap=true; - } - } - if(overlap == false) - { - GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).add(start+"\t"+last+"\t"+Mention+"\t"+MentionType); - } - } - } - i--; - } - - paragraph_num_last=paragraph_num; - pmid_last=pmid; - }// outputArr3 - - //GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,false); //save in BioC file - } - - public void PostProcessing(String Filename,String FilenameBioC) throws XMLStreamException, IOException - { - /** Develop Cell | FamilyName | DomainMotif lists */ - String Disease_Suffix="disease|diseases|syndrome|syndromes|tumor|tumour|deficiency|dysgenesis|atrophy|frame|dystrophy"; - String Cell_Suffix="cell|cells"; - String FamilyName_Suffix="disease|diseases|syndrome|syndromes|tumor|tumour|deficiency|dysgenesis|atrophy|frame|dystrophy|frame|factors|family|families|superfamily|superfamilies|subfamily|subfamilies|complex|genes|proteins"; - String DomainMotif_Suffix="domain|motif|domains|motifs|sequences"; - String Strain_Suffix="alpha|beta|gamma|kappa|theta|delta|[A-Ga-g0-9]"; - ArrayList Translate2Family = new ArrayList(); - - for(int i=0;i Mention2Type_Hash = new HashMap(); // for substring detection - Extract all mentions in the target PMID : MentionList - ArrayList GeneMentionPattern = new ArrayList(); // pattern match to extend Gene - HashMap MentionType2Num = new HashMap(); // for frequency calculation - if(GNormPlus.BioCDocobj.PMIDs.size()>=i) - { - String pmid=GNormPlus.BioCDocobj.PMIDs.get(i); - for(int j=0;j RemoveList = new ArrayList(); - for(int k=0;k Family name (TIF & TIF1) */ - boolean SubSt=false; - /* - // GDNFb -> GDNF (not work on 12682085_J_Cell_Biol_2003.xml) - for (String men : Mention2Type_Hash.keySet()) - { - if((!men.equals(mention.toLowerCase())) && men.matches(mention_tmp+"[\\W\\-\\_]*("+Strain_Suffix+")")) - { - GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\tFamilyName"); - if(GNormPlus.PmidLF2Abb_lc_hash.containsKey(GNormPlus.BioCDocobj.PMIDs.get(i)+"\t"+mention.toLowerCase())) - { - Translate2Family.add(GNormPlus.PmidLF2Abb_lc_hash.get(GNormPlus.BioCDocobj.PMIDs.get(i)+"\t"+mention.toLowerCase())); - } - else if(GNormPlus.PmidAbb2LF_lc_hash.containsKey(GNormPlus.BioCDocobj.PMIDs.get(i)+"\t"+mention.toLowerCase())) - { - Translate2Family.add(GNormPlus.PmidAbb2LF_lc_hash.get(GNormPlus.BioCDocobj.PMIDs.get(i)+"\t"+mention.toLowerCase())); - } - SubSt=true; - break; - } - } - */ - if(SubSt == false) - { - int BoundaryLen=15; - if(GNormPlus.BioCDocobj.PassageContexts.get(i).get(j).length() Family/Domain/Cell */ - if( mention.toLowerCase().matches(".*("+Cell_Suffix+")") || SurroundingString.matches("("+Cell_Suffix+")") ) - { - type="Cell"; - GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\t"+type); - } - else if( mention.toLowerCase().matches(".*("+FamilyName_Suffix+")") || SurroundingString.matches("("+FamilyName_Suffix+")") ) - { - type="FamilyName"; - GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\t"+type); - } - else if( mention.toLowerCase().matches(".*("+DomainMotif_Suffix+")")|| SurroundingString.matches("("+DomainMotif_Suffix+")") ) - { - type="DomainMotif"; - GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\t"+type); - } - else if(!type.equals("Gene")) - { - /* 3. Check (Family+Domain+Cell)/All rate (threshold = 0.5) - Family/Domain/Cell -> Gene */ - double Num_FDC=0; - double Num_Gene=0; - if(MentionType2Num.containsKey(mention.toLowerCase()+"\tFamilyName")) - { - Num_FDC = Num_FDC + MentionType2Num.get(mention.toLowerCase()+"\tFamilyName"); - } - if(MentionType2Num.containsKey(mention.toLowerCase()+"\tDomainMotif")) - { - Num_FDC = Num_FDC + MentionType2Num.get(mention.toLowerCase()+"\tDomainMotif"); - } - if(MentionType2Num.containsKey(mention.toLowerCase()+"\tCell")) - { - Num_FDC = Num_FDC + MentionType2Num.get(mention.toLowerCase()+"\tCell"); - } - if(MentionType2Num.containsKey(mention.toLowerCase()+"\tGene")) - { - Num_Gene = Num_Gene + MentionType2Num.get(mention.toLowerCase()+"\tGene"); - } - if(Num_Gene/(Num_FDC+Num_Gene)>=0.5) - { - GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\tGene"); - } - - /* 4. Extend Genes to Family/Domain mentions by pattern match - Family/Domain/Cell -> Gene */ - for(int p=0;p Abb.type - * - Abb only : Abb.type -> LF.type - * - LF only : LF.type -> Abb.type - */ - String lc_ment=mention.toLowerCase(); - if(GNormPlus.PmidAbb2LF_lc_hash.containsKey(pmid+"\t"+lc_ment)) //the target mention is abbreviation - { - //Infer Abbreviation by Long form - if(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+lc_ment).matches(".*("+Disease_Suffix+")")) - { - //remove the mention (Abb), because the LF is a disease - } - else if(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+lc_ment).matches(".*("+Cell_Suffix+")")) - { - //GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Anno[0]+"\t"+Anno[1]+"\tCell"); - } - else if(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+lc_ment).matches(".*("+FamilyName_Suffix+")") && !lc_ment.matches(".+[a-z][0-9][a-z]")) //AtRPA1a in pmid:19153602 - { - GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\tFamilyName"); - } - else if(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+lc_ment).matches(".*("+DomainMotif_Suffix+")")) - { - GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\tDomainMotif"); - } - else - { - if(Mention2Type_Hash.containsKey(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+lc_ment)) - && Mention2Type_Hash.get(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+lc_ment)).equals("Gene") - && !(type.equals("Gene")) - ) // if Long Form is recognized as a Gene, and Abb is recognized as not a Gene - { - GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\tGene"); - } - } - } - } //if(Remov == true) - } - } - - for(int j=0;j GeneMentionPattern = new ArrayList(); // pattern match to extend Gene - HashMap GeneMentions = new HashMap(); // Extending Gene mentions - HashMap GeneMentionLocationGNR = new HashMap(); // Extending Gene mentions - for(int j=0;ji && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j) - { - String PassageContexts = " " + GNormPlus.BioCDocobj.PassageContexts.get(i).get(j) + " "; - String PassageContexts_tmp = PassageContexts.toLowerCase(); - for(String gm : GeneMentions.keySet()) - { - String type=GeneMentions.get(gm); - if(type.equals("Gene")) - { - gm = gm.replaceAll("([\\W\\-\\_])", "\\\\$1"); - gm=gm.replaceAll("[0-9]", "\\[0\\-9\\]"); - gm=gm.replaceAll("(alpha|beta|gamma|theta|zeta|delta)", "(alpha\\|beta\\|gamma\\|theta\\|zeta\\|delta)"); - gm=gm.replaceAll("\\-[a-z]$", "\\-\\[a\\-z\\]"); - Pattern ptmp = Pattern.compile("^(.*[\\W\\-\\_])("+gm+")([\\W\\-\\_].*)$"); - Matcher mtmp = ptmp.matcher(PassageContexts_tmp); - while(mtmp.find()) - { - String pre = mtmp.group(1); - String gmtmp = mtmp.group(2); - String post = mtmp.group(3); - - int start = pre.length()-1; - int last = start+gmtmp.length(); - if(PassageContexts.length()>last) - { - String mention = PassageContexts.substring(start+1,last+1); - if(!GeneMentionLocationGNR.containsKey(j+"\t"+start) && !GeneMentionLocationGNR.containsKey(j+"\t"+last)) - { - if(GNormPlus.BioCDocobj.Annotations.get(i).get(j).contains(start+"\t"+last+"\t"+mention+"\tFamilyName")) - { - GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(start+"\t"+last+"\t"+mention+"\tFamilyName"); - } - else if(GNormPlus.BioCDocobj.Annotations.get(i).get(j).contains(start+"\t"+last+"\t"+mention+"\tDomainMotif")) - { - GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(start+"\t"+last+"\t"+mention+"\tDomainMotif"); - } - GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tGene"); - } - gmtmp = gmtmp.replaceAll(".", "X"); - PassageContexts_tmp=pre+""+gmtmp+""+post; - mtmp = ptmp.matcher(PassageContexts_tmp); - } - } - } - } - } - } - - //Extend to all family mentions - for(int j=0;ji && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j) - { - String PassageContexts = " " + GNormPlus.BioCDocobj.PassageContexts.get(i).get(j) + " "; - String PassageContexts_tmp = PassageContexts.toLowerCase(); - for(String gm : GeneMentions.keySet()) - { - String type=GeneMentions.get(gm); - if(type.matches("(FamilyName|DomainMotif)")) - { - gm = gm.replaceAll("([\\W\\-\\_])", "\\\\$1"); - gm=gm.replaceAll("s$", "(s\\|)"); - Pattern ptmp = Pattern.compile("^(.*[\\W\\-\\_])("+gm+")([\\W\\-\\_].*)$"); - Matcher mtmp = ptmp.matcher(PassageContexts_tmp); - while(mtmp.find()) - { - String pre = mtmp.group(1); - String gmtmp = mtmp.group(2); - String post = mtmp.group(3); - - int start = pre.length()-1; - int last = start+gmtmp.length(); - if(PassageContexts.length()>last) - { - String mention = PassageContexts.substring(start+1,last+1); - if(!GeneMentionLocationGNR.containsKey(j+"\t"+start) && !GeneMentionLocationGNR.containsKey(j+"\t"+last)) - { - if(!GNormPlus.BioCDocobj.Annotations.get(i).get(j).contains(start+"\t"+last+"\t"+mention+"\tGene")) - { - GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\t"+type); - } - } - gmtmp = gmtmp.replaceAll(".", "X"); - PassageContexts_tmp=pre+""+gmtmp+""+post; - mtmp = ptmp.matcher(PassageContexts_tmp); - } - } - } - } - } - } - } - } - GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,false); //save in BioC file - } -} - - +/** + * Project: GNormPlus + * Function: Gene Name Recognition + */ + +package GNormPluslib; + +import java.io.*; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import javax.xml.stream.XMLStreamException; + +import org.tartarus.snowball.SnowballStemmer; +import org.tartarus.snowball.ext.englishStemmer; + +import GNormPluslib.GNormPlus; +import GNormPluslib.BioCDoc; + +public class GNR +{ + /* + * Read BioC files + */ + public void Ab3P(String Filename,String FilenameAbb,String TrainTest) throws XMLStreamException,IOException + { + /** Abbreviation*/ + //BioC -> Abb input + String line=""; + BufferedWriter FileAbb = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FilenameAbb), "UTF-8")); + for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) + { + String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i); + String Context=""; + for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) + { + String PassageContext=GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); + if(PassageContext.matches(".*\\([^\\(\\)]+,[^\\(\\)]+\\).*")) + { + PassageContext=PassageContext.replaceAll("\\([^\\(\\)]+,[^\\(\\)]+\\)", ""); + } + if(PassageContext.contains("\\(")) + { + Context = Context+PassageContext+" "; + } + } + FileAbb.write(Pmid+"\n"+Context+"\n\n"); + } + FileAbb.close(); + //Abb + File f = new File(FilenameAbb+".out"); + BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), "UTF-8")); + Runtime runtime = Runtime.getRuntime(); + String cmd ="./Ab3P "+FilenameAbb+".Abb "+FilenameAbb+".out"; + + String OS=System.getProperty("os.name").toLowerCase(); + if(OS.contains("windows")) + { + cmd ="java -jar bioadi.jar "+FilenameAbb; + } + else //if(OS.contains("nux")||OS.contains("nix")) + { + cmd ="./Ab3P "+FilenameAbb+" "+FilenameAbb+".out"; + //cmd ="java -jar bioadi.jar "+FilenameAbb+" > "+FilenameAbb+".out"; + } + + Process process = runtime.exec(cmd); + InputStream is = process.getInputStream(); + InputStreamReader isr = new InputStreamReader(is, "UTF-8"); + BufferedReader br = new BufferedReader(isr); + line=""; + while ( (line = br.readLine()) != null) + { + fr.write(line); + fr.newLine(); + fr.flush(); + } + is.close(); + isr.close(); + br.close(); + fr.close(); + //Abb output -> Hash + BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameAbb+".out"), "UTF-8")); + line=""; + String pmid=""; + while ((line = inputfile.readLine()) != null) + { + String patt="^ (.+)\\|(.+)\\|([0-9\\.]+)$"; + Pattern ptmp = Pattern.compile(patt); + Matcher mtmp = ptmp.matcher(line); + if(line.matches("^[0-9]+$")) + { + pmid=line; + } + if(mtmp.find()) + { + String SF = mtmp.group(1); + String LF = mtmp.group(2); + double weight= Double.parseDouble(mtmp.group(3)); + GNormPlus.Pmid2Abb_hash.put(pmid+"\t"+SF, "Abb:SF"); + GNormPlus.Pmid2Abb_hash.put(pmid+"\t"+LF, "Abb:LF"); + GNormPlus.PmidLF2Abb_lc_hash.put(pmid+"\t"+LF.toLowerCase(), SF.toLowerCase()); + GNormPlus.PmidAbb2LF_lc_hash.put(pmid+"\t"+SF.toLowerCase(), LF.toLowerCase()); + GNormPlus.PmidAbb2LF_hash.put(pmid+"\t"+SF, LF); + if(weight >= 0.9) + { + GNormPlus.PmidLF2Abb_hash.put(pmid+"\t"+LF, SF); + } + } + } + inputfile.close(); + } + + public void LoadInputFile(String Filename,String FilenameAbb,String TrainTest) throws XMLStreamException,IOException + { + /** Read BioC file */ + //if(TrainTest.equals("Train")) + //{ + GNormPlus.BioCDocobj.BioCReaderWithAnnotation(Filename); + //} + //else + //{ + // GNormPlus.BioCDocobj.BioCReader(Filename); + //} + + + /** Abbreviation*/ + //BioC -> Abb input + String line=""; + BufferedWriter FileAbb = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FilenameAbb), "UTF-8")); + for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) + { + String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i); + String Context="Text:"; + for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) + { + String PassageContext=GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); + if(PassageContext.matches(".*\\([^\\(\\)]+,[^\\(\\)]+\\).*")) + { + PassageContext=PassageContext.replaceAll("\\([^\\(\\)]+,[^\\(\\)]+\\)", ""); + } + if(PassageContext.contains("(")) + { + Context = Context+PassageContext+" "; + } + } + FileAbb.write(Pmid+"\n"+Context+"\n\n"); + } + FileAbb.close(); + //Abb + File f = new File(FilenameAbb+".out"); + BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), "UTF-8")); + Runtime runtime = Runtime.getRuntime(); + String cmd ="./Ab3P "+FilenameAbb+".Abb "+FilenameAbb+".out"; + + String OS=System.getProperty("os.name").toLowerCase(); + if(OS.contains("windows")) + { + cmd ="java -jar bioadi.jar "+FilenameAbb; + } + else //if(OS.contains("nux")||OS.contains("nix")) + { + cmd ="./Ab3P "+FilenameAbb+" "+FilenameAbb+".out"; + //cmd ="java -jar bioadi.jar "+FilenameAbb+" > "+FilenameAbb+".out"; + } + + Process process = runtime.exec(cmd); + InputStream is = process.getInputStream(); + InputStreamReader isr = new InputStreamReader(is, "UTF-8"); + BufferedReader br = new BufferedReader(isr); + line=""; + while ( (line = br.readLine()) != null) + { + fr.write(line); + fr.newLine(); + fr.flush(); + } + is.close(); + isr.close(); + br.close(); + fr.close(); + //Abb output -> Hash + BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameAbb+".out"), "UTF-8")); + line=""; + String pmid=""; + while ((line = inputfile.readLine()) != null) + { + String patt="^ (.+)\\|(.+)\\|([0-9\\.]+)$"; + Pattern ptmp = Pattern.compile(patt); + Matcher mtmp = ptmp.matcher(line); + if(line.matches("^[0-9]+$")) + { + pmid=line; + } + if(mtmp.find()) + { + String SF = mtmp.group(1); + String LF = mtmp.group(2); + double weight= Double.parseDouble(mtmp.group(3)); + GNormPlus.Pmid2Abb_hash.put(pmid+"\t"+SF, "Abb:SF"); + GNormPlus.Pmid2Abb_hash.put(pmid+"\t"+LF, "Abb:LF"); + GNormPlus.PmidLF2Abb_lc_hash.put(pmid+"\t"+LF.toLowerCase(), SF.toLowerCase()); + GNormPlus.PmidAbb2LF_lc_hash.put(pmid+"\t"+SF.toLowerCase(), LF.toLowerCase()); + GNormPlus.PmidAbb2LF_hash.put(pmid+"\t"+SF, LF); + if(weight >= 0.9) + { + GNormPlus.PmidLF2Abb_hash.put(pmid+"\t"+LF, SF); + } + } + } + inputfile.close(); + } + + /* + * Feature Extraction + */ + public void FeatureExtraction(String FilenameData,String FilenameLoca,String TrainTest) throws XMLStreamException + { + try + { + /** output files */ + BufferedWriter FileLocation = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FilenameLoca), "UTF-8")); // .location + BufferedWriter FileData = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FilenameData), "UTF-8")); // .data + //NLP modules + SnowballStemmer stemmer = new englishStemmer(); + /** PMIDs : i */ + for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) + { + String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i); + + /** Paragraphs : j */ + for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) + { + String PassageName= GNormPlus.BioCDocobj.PassageNames.get(i).get(j); // Passage name + int PassageOffset = GNormPlus.BioCDocobj.PassageOffsets.get(i).get(j); // Passage offset + String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context + ArrayList Annotation = GNormPlus.BioCDocobj.Annotations.get(i).get(j); // Annotation + HashMap CTDGene_hash = new HashMap(); + HashMap FamilyName_hash = new HashMap(); + HashMap character_hash = new HashMap(); + HashMap Abbreviation_hash = new HashMap(); + String PassageContext_tmp=" "+PassageContext+" "; + + /** Abbreviation */ + HashMap Abb_sortebylength = new HashMap(); + ArrayList length_list = new ArrayList(); + int countn=0; + for (Object key : GNormPlus.Pmid2Abb_hash.keySet()) + { + String pmid2abb[]=key.toString().split("\t"); + if(Pmid.equals(pmid2abb[0])) + { + Abb_sortebylength.put(pmid2abb[1].length()*100+countn, pmid2abb[1]); + length_list.add(pmid2abb[1].length()*100+countn); + countn++; + } + } + Collections.sort(length_list); + for (int l=length_list.size()-1;l>=0;l--) + { + String AbbLF = Abb_sortebylength.get(length_list.get(l)); + AbbLF=AbbLF.replaceAll("([^A-Za-z0-9@ ])", "\\\\$1"); + AbbLF=AbbLF.replaceAll(" ", "\\[ \\]\\+"); + Pattern ptmp = Pattern.compile("^(.*[^A-Za-z0-9]+)("+AbbLF+")([^A-Za-z0-9]+.*)$"); + Matcher mtmp = ptmp.matcher(PassageContext_tmp); + while(mtmp.find()) + { + String str1=mtmp.group(1); + String str2=mtmp.group(2); + String str3=mtmp.group(3); + for(int m=str1.length();m<=(str1.length()+str2.length());m++) + { + Abbreviation_hash.put((m-1),GNormPlus.Pmid2Abb_hash.get(Pmid+"\t"+Abb_sortebylength.get(length_list.get(l)))); + } + String men=""; + for(int m=0;m locations = GNormPlus.PT_CTDGene.SearchMentionLocation(PassageContext,"CTDGene"); + for (int k = 0 ; k < locations.size() ; k++) + { + String anno[]=locations.get(k).split("\t"); + int start= Integer.parseInt(anno[0]) + PassageOffset; + int last= Integer.parseInt(anno[1]) + PassageOffset; + String mention = anno[2]; + String id = anno[3]; + + CTDGene_hash.put(start,"CTDGene_B"); + CTDGene_hash.put(last,"CTDGene_E"); + for(int s=start+1;s locations_Fname = GNormPlus.PT_FamilyName.SearchMentionLocation(PassageContext,"FamilyName"); + for (int k = 0 ; k < locations_Fname.size() ; k++) + { + String anno[]=locations_Fname.get(k).split("\t"); + int start= Integer.parseInt(anno[0]) + PassageOffset; + int last= Integer.parseInt(anno[1]) + PassageOffset; + String mention = anno[2]; + String id = anno[3]; + + if(!CTDGene_hash.containsKey(start)) + { + FamilyName_hash.put(start,"famplex_B"); + FamilyName_hash.put(last,"famplex_E"); + for(int s=start+1;stokens[p].length() && PassageContext_tmp.substring(tokens[p].length(),tokens[p].length()+1).equals(" ")) + { + WSF="WSF:Gap"; + } + if(p==0) + { + WSB="WSB:1st"; + } + else if(p==tokens.length-1) + { + WSF="WSF:last"; + } + + if(PassageContext_tmp.substring(0,tokens[p].length()).equals(tokens[p])) + { + if(tokens[p].length()>0) + { + /* + * .loca + */ + int start=Offset; + int last=Offset+tokens[p].length(); + String State=""; + if(!character_hash.containsKey(start) || !character_hash.containsKey(last)){} + else if(character_hash.get(start).matches(".*B$")) + { + State=character_hash.get(start); + } + else if(character_hash.get(last).matches(".*E$")) + { + State=character_hash.get(last); + } + else if(character_hash.get(start).matches(".*I$")) + { + State=character_hash.get(start); + } + + if((!tokens[p].equals("\t"))) + { + FileLocation.write(Pmid+"\t"+PassageName+"\t"+j+"\t"+tokens[p]+"\t"+(Offset+1)+"\t"+(Offset+tokens[p].length())+"\t"+State+"\n"); + } + + /* + * .data + */ + + //Abbreviation + String Abb_State="__nil__"; + if(!Abbreviation_hash.containsKey(start) || !Abbreviation_hash.containsKey(last)){Abb_State="__nil__";} + else if(Abbreviation_hash.containsKey(start)) + { + Abb_State=Abbreviation_hash.get(start); + } + + //CTDGene + start=PassageOffset+Offset; + last=PassageOffset+Offset+tokens[p].length(); + String CTDGene_State="__nil__"; + if(!CTDGene_hash.containsKey(start) || !CTDGene_hash.containsKey(last)){CTDGene_State="__nil__";} + else if(CTDGene_hash.get(start).matches(".*B$")) + { + CTDGene_State=CTDGene_hash.get(start); + } + else if(CTDGene_hash.get(last).matches(".*E$")) + { + CTDGene_State=CTDGene_hash.get(last); + } + else if(CTDGene_hash.get(start).matches(".*I$")) + { + CTDGene_State=CTDGene_hash.get(start); + } + + //FamilyName + if(CTDGene_State.equals("__nil__")) + { + start=PassageOffset+Offset; + last=PassageOffset+Offset+tokens[p].length(); + if(!FamilyName_hash.containsKey(start) || !FamilyName_hash.containsKey(last)){} + else if(FamilyName_hash.get(start).matches(".*B$")) + { + CTDGene_State=FamilyName_hash.get(start); + } + else if(FamilyName_hash.get(last).matches(".*E$")) + { + CTDGene_State=FamilyName_hash.get(last); + } + else if(FamilyName_hash.get(start).matches(".*I$")) + { + CTDGene_State=FamilyName_hash.get(start); + } + } + + //stemming + stemmer.setCurrent(tokens[p].toLowerCase()); + stemmer.stem(); + String stem=stemmer.getCurrent(); + + //Number of Numbers [0-9] + String Num_num=""; + String tmp=tokens[p]; + tmp=tmp.replaceAll("[^0-9]",""); + if(tmp.length()>3){Num_num="N:4+";}else{Num_num="N:"+ tmp.length();} + + //Number of Uppercase [A-Z] + String Num_Uc=""; + tmp=tokens[p]; + tmp=tmp.replaceAll("[^A-Z]",""); + if(tmp.length()>3){Num_Uc="U:4+";}else{Num_Uc="U:"+ tmp.length();} + + //Number of Lowercase [a-z] + String Num_lc=""; + tmp=tokens[p]; + tmp=tmp.replaceAll("[^a-z]",""); + if(tmp.length()>3){Num_lc="L:4+";}else{Num_lc="L:"+ tmp.length();} + + //Number of ALL char + String Num_All=""; + if(tokens[p].length()>3){Num_All="A:4+";}else{Num_All="A:"+ tokens[p].length();} + + //specific character (;:,.->+_) + String SpecificC="__nil__"; + if(tokens[p].equals(";") || tokens[p].equals(":") || tokens[p].equals(",") || tokens[p].equals(".") || tokens[p].equals("-") || tokens[p].equals(">") || tokens[p].equals("+") || tokens[p].equals("_")) + { + SpecificC="-SpecificC1-"; + } + else if(tokens[p].equals("(") || tokens[p].equals(")")) + { + SpecificC="-SpecificC2-"; + } + else if(tokens[p].equals("{") || tokens[p].equals("}")) + { + SpecificC="-SpecificC3-"; + } + else if(tokens[p].equals("[") || tokens[p].equals("]")) + { + SpecificC="-SpecificC4-"; + } + else if(tokens[p].equals("\\") || tokens[p].equals("/")) + { + SpecificC="-SpecificC5-"; + } + + //Chemical Prefix/Suffix + String ChemPreSuf="__nil__"; + if(tokens[p].matches(".*(yl|ylidyne|oyl|sulfonyl)")){ChemPreSuf="-CHEMinlineSuffix-";} + else if(tokens[p].matches("(meth|eth|prop|tetracos).*")){ChemPreSuf="-CHEMalkaneStem-";} + else if(tokens[p].matches("(di|tri|tetra).*")){ChemPreSuf="-CHEMsimpleMultiplier-";} + else if(tokens[p].matches("(benzen|pyridin|toluen).*")){ChemPreSuf="-CHEMtrivialRing-";} + else if(tokens[p].matches(".*(one|ol|carboxylic|amide|ate|acid|ium|ylium|ide|uide|iran|olan|inan|pyrid|acrid|amid|keten|formazan|fydrazin)(s|)")){ChemPreSuf="-CHEMsuffix-";} + + + //Mention Type + String MentionType="__nil__"; + /* + if($tmp eq "to" && $CTD_result_hash{$count_token-1} eq "CTD_gene" && $CTD_result_hash{$count_token+1} eq "CTD_gene"){$CTD_result_hash{$count_token}="CTD_gene";} + if($tmp=~/^(or|and|,)$/ && $CTD_result_hash{$count_token-1} eq "CTD_gene" && $CTD_result_hash{$count_token+1} eq "CTD_gene"){$MentionType="-Type_GeneConjunction-";} + elsif($tmp=~/^(or|and|,)$/ && $last_token=~/^(or|and|,)$/ && $CTD_result_hash{$count_token-2} eq "CTD_gene" && $CTD_result_hash{$count_token+1} eq "CTD_gene"){$MentionType="-Type_GeneConjunction-";} + elsif($tmp=~/^(or|and|,)$/ && $next_token=~/^(or|and|,)$/ && $CTD_result_hash{$count_token-1} eq "CTD_gene" && $CTD_result_hash{$count_token+2} eq "CTD_gene"){$MentionType="-Type_GeneConjunction-";} + */ + if(tokens[p].matches("(ytochrome|cytochrome)")){MentionType="-Type_cytochrome-";} + else if(tokens[p].matches(".*target") ){MentionType="-Type_target-";} + else if(tokens[p].matches(".*(irradiation|hybrid|fusion|experiment|gst|est|gap|antigen)") ){MentionType="-Type_ExperimentNoun-";} + else if(tokens[p].matches(".*(disease|disorder|dystrophy|deficiency|syndrome|dysgenesis|cancer|injury|neoplasm|diabetes|diabete)") ){MentionType="-Type_Disease-";} + else if(tokens[p].matches(".*(motif|domain|omain|binding|site|region|sequence|frameshift|finger|box).*") ){MentionType="-Type_DomainMotif-";} + else if(tokens[p].equals("-") && (p0 && tokens[p-1].matches("^[0-9]+$")) ) ){MentionType="-Type_ChromosomeStrain-";} + else if(tokens[p].matches(".*(related|regulated|associated|correlated|reactive).*")){MentionType="-Type_relation-";} + else if(tokens[p].toLowerCase().matches(".*(polymorphism|mutation|deletion|insertion|duplication|genotype|genotypes).*") ){MentionType="-Type_VariationTerms-";} + else if(tokens[p].matches(".*(oxidase|transferase|transferases|kinase|kinese|subunit|unit|receptor|adrenoceptor|transporter|regulator|transcription|antigen|protein|gene|factor|member|molecule|channel|deaminase|spectrin).*") ){MentionType="-Type_suffix-";} + else if(tokens[p].matches("[\\(\\-\\_]") && (p=1){ prefix=tmp.substring(0, 1);}else{prefix="__nil__";} + if(tmp.length()>=2){ prefix=prefix+" "+tmp.substring(0, 2);}else{prefix=prefix+" __nil__";} + if(tmp.length()>=3){ prefix=prefix+" "+tmp.substring(0, 3);}else{prefix=prefix+" __nil__";} + if(tmp.length()>=4){ prefix=prefix+" "+tmp.substring(0, 4);}else{prefix=prefix+" __nil__";} + if(tmp.length()>=5){ prefix=prefix+" "+tmp.substring(0, 5);}else{prefix=prefix+" __nil__";} + + + //suffix + String suffix=""; + tmp=tokens[p]; + if(tmp.length()>=1){ suffix=tmp.substring(tmp.length()-1, tmp.length());}else{suffix="__nil__";} + if(tmp.length()>=2){ suffix=suffix+" "+tmp.substring(tmp.length()-2, tmp.length());}else{suffix=suffix+" __nil__";} + if(tmp.length()>=3){ suffix=suffix+" "+tmp.substring(tmp.length()-3, tmp.length());}else{suffix=suffix+" __nil__";} + if(tmp.length()>=4){ suffix=suffix+" "+tmp.substring(tmp.length()-4, tmp.length());}else{suffix=suffix+" __nil__";} + if(tmp.length()>=5){ suffix=suffix+" "+tmp.substring(tmp.length()-5, tmp.length());}else{suffix=suffix+" __nil__";} + + if(State.equals("")) + { + State="O"; + } + + if((!tokens[p].equals("\t"))) + { + if(TrainTest.equals("Train")) + { + FileData.write(tokens[p]+" "+stem+" "+WSB+" "+WSF+" "+Num_num+" "+Num_Uc+" "+Num_lc+" "+Num_All+" "+SpecificC+" "+ChemPreSuf+" "+MentionType+" "+ProteinSym+" "+prefix+" "+suffix+" "+CTDGene_State+" "+Abb_State+" "+State+"\n"); + } + else + { + FileData.write(tokens[p]+" "+stem+" "+WSB+" "+WSF+" "+Num_num+" "+Num_Uc+" "+Num_lc+" "+Num_All+" "+SpecificC+" "+ChemPreSuf+" "+MentionType+" "+ProteinSym+" "+prefix+" "+suffix+" "+CTDGene_State+" "+Abb_State+"\n"); + } + } + PassageContext_tmp=PassageContext_tmp.substring(tokens[p].length()); // remove the token for the context + Offset=Offset+tokens[p].length(); + } + } + } + if(tokens.length>0) + { + FileLocation.write("\n"); + FileData.write("\n"); + } + } + } + FileLocation.close(); + FileData.close(); + } + catch(IOException e1){ System.out.println("[MR]: Input file is not exist.");} + } + /* + * Testing by CRF++ + */ + public void CRF_test(String model, String FilenameData, String FilenameOutput) throws IOException + { + File f = new File(FilenameOutput); + BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), "UTF-8")); + + Runtime runtime = Runtime.getRuntime(); + + String OS=System.getProperty("os.name").toLowerCase(); + + String cmd="./CRF/crf_test -m "+model+" -o "+FilenameOutput+" "+FilenameData; + if(OS.contains("windows")) + { + cmd ="CRF/crf_test -m "+model+" -o "+FilenameOutput+" "+FilenameData; + } + else //if(OS.contains("nux")||OS.contains("nix")) + { + cmd ="./CRF/crf_test -m "+model+" -o "+FilenameOutput+" "+FilenameData; + } + + try { + Process process = runtime.exec(cmd); + InputStream is = process.getInputStream(); + InputStreamReader isr = new InputStreamReader(is, "UTF-8"); + BufferedReader br = new BufferedReader(isr); + String line=""; + while ( (line = br.readLine()) != null) + { + fr.write(line); + fr.newLine(); + fr.flush(); + } + is.close(); + isr.close(); + br.close(); + fr.close(); + } + catch (IOException e) { + System.out.println(e); + runtime.exit(0); + } + } + + public void CRF_test(String model,String FilenameData,String FilenameOutput,String top3) throws IOException + { + File f = new File(FilenameOutput); + BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), "UTF-8")); + + Runtime runtime = Runtime.getRuntime(); + + String OS=System.getProperty("os.name").toLowerCase(); + + String cmd="./CRF/crf_test -n 3 -m "+model+" -o "+FilenameOutput+" "+FilenameData; + if(OS.contains("windows")) + { + cmd ="CRF/crf_test -n 3 -m "+model+" -o "+FilenameOutput+" "+FilenameData; + } + else //if(OS.contains("nux")||OS.contains("nix")) + { + cmd ="./CRF/crf_test -n 3 -m "+model+" -o "+FilenameOutput+" "+FilenameData; + } + + try { + Process process = runtime.exec(cmd); + InputStream is = process.getInputStream(); + InputStreamReader isr = new InputStreamReader(is, "UTF-8"); + BufferedReader br = new BufferedReader(isr); + String line=""; + while ( (line = br.readLine()) != null) + { + fr.write(line); + fr.newLine(); + fr.flush(); + } + is.close(); + isr.close(); + br.close(); + fr.close(); + } + catch (IOException e) { + System.out.println(e); + runtime.exit(0); + } + } + + /* + * Learning model by CRF++ + */ + public void CRF_learn(String model, String FilenameData) throws IOException + { + Runtime runtime = Runtime.getRuntime(); + + Process process = null; + String line = null; + InputStream is = null; + InputStreamReader isr = null; + BufferedReader br = null; + + String OS=System.getProperty("os.name").toLowerCase(); + + String cmd="./CRF/crf_learn -f 3 -c 4.0 CRF/template_UB "+FilenameData+" "+model; + if(OS.contains("windows")) + { + cmd ="CRF/crf_learn -f 3 -c 4.0 CRF/template_UB "+FilenameData+" "+model; + } + else //if(OS.contains("nux")||OS.contains("nix")) + { + cmd ="./CRF/crf_learn -f 3 -c 4.0 CRF/template_UB "+FilenameData+" "+model; + } + + try { + process = runtime.exec(cmd); + is = process.getInputStream(); + isr = new InputStreamReader(is, "UTF-8"); + br = new BufferedReader(isr); + while ( (line = br.readLine()) != null) + { + System.out.println(line); + System.out.flush(); + } + is.close(); + isr.close(); + br.close(); + } + catch (IOException e) { + System.out.println(e); + runtime.exit(0); + } + } + + public void ReadCRFresult(String Filename,String FilenameLoca,String FilenameOutput,String FilenameBioC) throws XMLStreamException, IOException + { + /** load CRF output */ + ArrayList outputArr = new ArrayList(); + BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameOutput), "UTF-8")); + String line; + while ((line = inputfile.readLine()) != null) + { + outputArr.add(line); + } + inputfile.close(); + + /** load location */ + ArrayList locationArr = new ArrayList(); + inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameLoca), "UTF-8")); + while ((line = inputfile.readLine()) != null) + { + locationArr.add(line); + } + inputfile.close(); + + /** output -> mentions */ + String pmid_last=""; + String paragraph_num_last=""; + String pmid=""; + String paragraph=""; + String paragraph_num=""; + Pattern pat_B = Pattern.compile("((FamilyName|DomainMotif|Gene)_[B])$"); + Pattern pat_IE = Pattern.compile("((FamilyName|DomainMotif|Gene)_[IE])$"); + ArrayList> AnnotationInPMID = new ArrayList(); // array of Annotations in the PMIDs + ArrayList AnnotationInPassage= new ArrayList(); // array of Annotations in the Passage + GNormPlus.BioCDocobj.Annotations = new ArrayList(); + int countPMID=0; + int countPassage=0; + /** outputArr */ + for(int i=0;i3) + { + pmid=locationRow[0]; + paragraph=locationRow[1]; + paragraph_num=locationRow[2]; + } + + if( (!paragraph_num_last.equals("")) && (!paragraph_num.equals(paragraph_num_last)) ) + { + AnnotationInPMID.add(AnnotationInPassage); + AnnotationInPassage = new ArrayList(); + countPassage++; + } + if( (!pmid_last.equals("")) && (!pmid.equals(pmid_last)) ) + { + GNormPlus.BioCDocobj.Annotations.add(AnnotationInPMID); + AnnotationInPMID = new ArrayList(); + countPMID++; + countPassage=0; + } + + boolean F = false; //Flag of Finding + if(locationRow.length>2) + { + Matcher mat = pat_B.matcher(outputsRow[outputsRow.length-1]); // last column : Status + while(mat.find() && locationRow.length==6) + { + MentionType=mat.group(2); + pmid=locationRow[0]; + paragraph_num=locationRow[2]; + int start_tmp=Integer.parseInt(locationRow[4])-1; + int last_tmp=Integer.parseInt(locationRow[5]); + if(start_tmplast){last=last_tmp;} + i++; + F = true; + if(locationArr.get(i).length()>0) + { + outputsRow=outputArr.get(i).split("\\t"); + locationRow=locationArr.get(i).split("\\t"); + mat = pat_IE.matcher(outputsRow[outputsRow.length-1]); + } + else + { + break; + } + } + } + + if(F == true) + { + String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage); // Passage context + String Mention = PassageContext.substring(start, last); + String Mention_nospace = Mention.replaceAll("[\\W\\-\\_]", ""); + if(Mention.toLowerCase().matches("(figure|tables|fig|tab|exp\\. [0-9]+).*")){} + else if(Mention.matches("[A-Z][A-Z]s")){} + else if(Mention.matches(".*\\|.*")){} + else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\;\\,\\'\\/\\\\].*")){} + else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\(].*") && !Mention.matches(".*[\\)].*")){} + else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\[].*") && !Mention.matches(".*[\\]].*")){} + else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\)].*") && !Mention.matches(".*[\\(].*")){} + else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\]].*") && !Mention.matches(".*[\\[].*")){} + else + { + AnnotationInPassage.add(start+"\t"+last+"\t"+Mention+"\t"+MentionType); + } + i--; + } + + paragraph_num_last=paragraph_num; + pmid_last=pmid; + }// outputArr1 + AnnotationInPMID.add(AnnotationInPassage); + GNormPlus.BioCDocobj.Annotations.add(AnnotationInPMID); + + //GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,false); //save in BioC file + } + + public void ReadCRFresult(String Filename,String FilenameLoca,String FilenameOutput,String FilenameBioC,double threshold,double threshold_GeneType) throws XMLStreamException, IOException + { + /** load CRF output */ + ArrayList outputArr1 = new ArrayList(); + ArrayList outputArr2 = new ArrayList(); + ArrayList outputArr3 = new ArrayList(); + ArrayList outputArr1_score = new ArrayList(); + ArrayList outputArr2_score = new ArrayList(); + ArrayList outputArr3_score = new ArrayList(); + BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameOutput), "UTF-8")); + String line; + int rank=0; + String score=""; + Pattern pat_Rank = Pattern.compile("^# ([0-2]) ([0-9\\.]+)$"); + while ((line = inputfile.readLine()) != null) + { + Matcher mat = pat_Rank.matcher(line); // last column : Status + if(mat.find()) + { + rank = Integer.parseInt(mat.group(1)); + score = mat.group(2); + } + else if(rank == 0) + { + outputArr1.add(line); + outputArr1_score.add(score); + } + else if(rank == 1) + { + outputArr2.add(line); + outputArr2_score.add(score); + } + else if(rank == 2) + { + outputArr3.add(line); + outputArr3_score.add(score); + } + } + inputfile.close(); + + /** load location */ + ArrayList locationArr = new ArrayList(); + inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameLoca), "UTF-8")); + while ((line = inputfile.readLine()) != null) + { + locationArr.add(line); + } + inputfile.close(); + + /** output -> mentions */ + String pmid_last=""; + String paragraph_num_last=""; + String pmid=""; + String paragraph=""; + String paragraph_num=""; + Pattern pat_B = Pattern.compile("((FamilyName|DomainMotif|Gene)_[B])$"); + Pattern pat_IE = Pattern.compile("((FamilyName|DomainMotif|Gene)_[IE])$"); + ArrayList> AnnotationInPMID = new ArrayList(); // array of Annotations in the PMIDs + ArrayList AnnotationInPassage= new ArrayList(); // array of Annotations in the Passage + GNormPlus.BioCDocobj.Annotations = new ArrayList(); + int countPMID=0; + int countPassage=0; + /** outputArr1 */ + int size_Arr=outputArr1.size(); + if(locationArr.size()3) + { + pmid=locationRow[0]; + paragraph=locationRow[1]; + paragraph_num=locationRow[2]; + } + + boolean F = false; //Flag of Finding + if(outputsRow.length>=1) + { + Matcher mat = pat_B.matcher(outputsRow[outputsRow.length-1]); // last column : Status + while(mat.find() && locationRow.length==6) + { + MentionType=mat.group(2); + pmid=locationRow[0]; + int start_tmp=Integer.parseInt(locationRow[4])-1; + int last_tmp=Integer.parseInt(locationRow[5]); + if(start_tmplast){last=last_tmp;} + i++; + outputsRow=outputArr1.get(i).split("\\t"); + locationRow=locationArr.get(i).split("\\t"); + mat = pat_IE.matcher(outputsRow[outputsRow.length-1]); + F = true; + } + } + + if( (!paragraph_num_last.equals("")) && (!paragraph_num.equals(paragraph_num_last)) ) // paragraph change + { + AnnotationInPMID.add(AnnotationInPassage); + AnnotationInPassage = new ArrayList(); + countPassage++; + } + + if( !pmid.equals(pmid_last) && paragraph_num.equals("0") && paragraph_num_last.equals("0") ) // pmid change (special case : the article only has one paragrpah) + { + AnnotationInPMID.add(AnnotationInPassage); + AnnotationInPassage = new ArrayList(); + GNormPlus.BioCDocobj.Annotations.add(AnnotationInPMID); + AnnotationInPMID = new ArrayList(); + countPMID++; + countPassage=0; + } + else if( (!pmid_last.equals("")) && (!pmid.equals(pmid_last)) ) // pmid change + { + GNormPlus.BioCDocobj.Annotations.add(AnnotationInPMID); + AnnotationInPMID = new ArrayList(); + countPMID++; + countPassage=0; + } + + if(F == true) + { + if(GNormPlus.BioCDocobj.PassageContexts.size()>countPMID && GNormPlus.BioCDocobj.PassageContexts.get(countPMID).size()>countPassage && GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage).length()>=last && (last-start)<1000) + { + String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage); // Passage context + String Mention = PassageContext.substring(start, last); + String Mention_nospace = Mention.replaceAll("[\\W\\-\\_]", ""); + if(Mention.toLowerCase().matches("(figure|tables|fig|tab|exp\\. [0-9]+).*")){} + else if(Mention.matches("[A-Z][A-Z]s")){} + else if(Mention.matches(".*\\|.*")){} + else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\;\\,\\'\\/\\\\].*")){} + else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\(].*") && !Mention.matches(".*[\\)].*")){} + else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\[].*") && !Mention.matches(".*[\\]].*")){} + else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\)].*") && !Mention.matches(".*[\\(].*")){} + else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\]].*") && !Mention.matches(".*[\\[].*")){} + else if((GNormPlus.Abb2Longformtok_hash.containsKey(Mention_nospace.toLowerCase())) && (PassageContext.toLowerCase().matches(".*[\\W\\-\\-]("+GNormPlus.Abb2Longformtok_hash.get(Mention_nospace.toLowerCase())+")[\\W\\-\\-].*"))) + { + //System.out.println(Mention_nospace.toLowerCase()+"\t"+GNormPlus.Abb2Longformtok_hash.get(Mention_nospace.toLowerCase())); + } + else + { + AnnotationInPassage.add(start+"\t"+last+"\t"+Mention+"\t"+MentionType); + } + } + i--; + } + paragraph_num_last=paragraph_num; + pmid_last=pmid; + }// outputArr1 + AnnotationInPMID.add(AnnotationInPassage); + GNormPlus.BioCDocobj.Annotations.add(AnnotationInPMID); + + /** outputArr2 */ + pmid_last=""; + paragraph_num_last=""; + pmid=""; + paragraph=""; + paragraph_num=""; + countPMID=0; + countPassage=0; + size_Arr=outputArr2.size(); + if(locationArr.size()2) + { + pmid=locationRow[0]; + paragraph=locationRow[1]; + paragraph_num=locationRow[2]; + } + + boolean F = false; //Flag of Finding + if(outputsRow.length>=1) + { + Matcher mat = pat_B.matcher(outputsRow[outputsRow.length-1]); // last column : Status + while(mat.find() && locationRow.length==6) + { + MentionType=mat.group(2); + pmid=locationRow[0]; + int start_tmp=Integer.parseInt(locationRow[4])-1; + int last_tmp=Integer.parseInt(locationRow[5]); + if(start_tmplast){last=last_tmp;} + i++; + outputsRow=outputArr2.get(i).split("\\t"); + locationRow=locationArr.get(i).split("\\t"); + mat = pat_IE.matcher(outputsRow[outputsRow.length-1]); + F = true; + } + } + + if( (!paragraph_num_last.equals("")) && (!paragraph_num.equals(paragraph_num_last)) ) // paragraph change + { + countPassage++; + } + + if( !pmid.equals(pmid_last) && paragraph_num.equals("0") && paragraph_num_last.equals("0") ) // pmid change (special case : the article only has one paragrpah) + { + countPMID++; + countPassage=0; + } + else if( (!pmid_last.equals("")) && (!pmid.equals(pmid_last)) ) // pmid change + { + countPMID++; + countPassage=0; + } + + if(F == true) + { + if(GNormPlus.BioCDocobj.PassageContexts.size()>countPMID && GNormPlus.BioCDocobj.PassageContexts.get(countPMID).size()>countPassage && GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage).length()>=last && (last-start)<1000) + { + String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage); // Passage context + String Mention = PassageContext.substring(start, last); + String Mention_nospace = Mention.replaceAll("[\\W\\-\\_]", ""); + if(Mention.toLowerCase().matches("(figure|tables|fig|tab|exp\\. [0-9]+).*")){} + else if(Mention.matches("[A-Z][A-Z]s")){} + else if(Mention.matches(".*\\|.*")){} + else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\;\\,\\'\\/\\\\].*")){} + else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\(].*") && !Mention.matches(".*[\\)].*")){} + else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\[].*") && !Mention.matches(".*[\\]].*")){} + else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\)].*") && !Mention.matches(".*[\\(].*")){} + else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\]].*") && !Mention.matches(".*[\\[].*")){} + else if((GNormPlus.Abb2Longformtok_hash.containsKey(Mention_nospace.toLowerCase())) && (PassageContext.toLowerCase().matches(".*[\\W\\-\\-]("+GNormPlus.Abb2Longformtok_hash.get(Mention_nospace.toLowerCase())+")[\\W\\-\\-].*"))) + { + //System.out.println(Mention_nospace.toLowerCase()+"\t"+GNormPlus.Abb2Longformtok_hash.get(Mention_nospace.toLowerCase())); + } + else if(Double.parseDouble(outputArr2_score.get(i))>threshold) + { + boolean overlap=false; + for(int j=0;jthreshold_GeneType && GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).get(j).matches(start+"\t"+last+"\t"+Mention_tmp+"\t(FamilyName|DomainMotif)") ) + { + GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).set(j, start+"\t"+last+"\t"+Mention+"\t"+MentionType); + } + else if( (start>=startj && startstartj && last<=lastj) ) + { + overlap=true; + } + } + if(overlap == false) + { + GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).add(start+"\t"+last+"\t"+Mention+"\t"+MentionType); + } + } + } + i--; + } + + paragraph_num_last=paragraph_num; + pmid_last=pmid; + }// outputArr2 + + /** outputArr3 */ + pmid_last=""; + paragraph_num_last=""; + pmid=""; + paragraph=""; + paragraph_num=""; + countPMID=0; + countPassage=0; + size_Arr=outputArr3.size(); + if(locationArr.size()2) + { + pmid=locationRow[0]; + paragraph=locationRow[1]; + paragraph_num=locationRow[2]; + } + + boolean F = false; //Flag of Finding + if(outputsRow.length>=1) + { + Matcher mat = pat_B.matcher(outputsRow[outputsRow.length-1]); // last column : Status + while(mat.find() && locationRow.length==6) + { + MentionType=mat.group(2); + pmid=locationRow[0]; + paragraph_num=locationRow[2]; + int start_tmp=Integer.parseInt(locationRow[4])-1; + int last_tmp=Integer.parseInt(locationRow[5]); + if(start_tmplast){last=last_tmp;} + i++; + outputsRow=outputArr3.get(i).split("\\t"); + locationRow=locationArr.get(i).split("\\t"); + mat = pat_IE.matcher(outputsRow[outputsRow.length-1]); + F = true; + } + } + + if( (!paragraph_num_last.equals("")) && (!paragraph_num.equals(paragraph_num_last)) ) // paragraph change + { + countPassage++; + } + + if( !pmid.equals(pmid_last) && paragraph_num.equals("0") && paragraph_num_last.equals("0") ) // pmid change (special case : the article only has one paragrpah) + { + countPMID++; + countPassage=0; + } + else if( (!pmid_last.equals("")) && (!pmid.equals(pmid_last)) ) // pmid change + { + countPMID++; + countPassage=0; + } + + if(F == true) + { + if(GNormPlus.BioCDocobj.PassageContexts.size()>countPMID && GNormPlus.BioCDocobj.PassageContexts.get(countPMID).size()>countPassage && GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage).length()>=last && (last-start)<1000) + { + String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage); // Passage context + String Mention = PassageContext.substring(start, last); + String Mention_nospace = Mention.replaceAll("[\\W\\-\\_]", ""); + if(Mention.toLowerCase().matches("(figure|tables|fig|tab|exp\\. [0-9]+).*")){} + else if(Mention.matches("[A-Z][A-Z]s")){} + else if(Mention.matches(".*\\|.*")){} + else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\;\\,\\'\\/\\\\].*")){} + else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\(].*") && !Mention.matches(".*[\\)].*")){} + else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\[].*") && !Mention.matches(".*[\\]].*")){} + else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\)].*") && !Mention.matches(".*[\\(].*")){} + else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\]].*") && !Mention.matches(".*[\\[].*")){} + else if((GNormPlus.Abb2Longformtok_hash.containsKey(Mention_nospace.toLowerCase())) && (PassageContext.toLowerCase().matches(".*[\\W\\-\\-]("+GNormPlus.Abb2Longformtok_hash.get(Mention_nospace.toLowerCase())+")[\\W\\-\\-].*"))) + { + //System.out.println(Mention_nospace.toLowerCase()+"\t"+GNormPlus.Abb2Longformtok_hash.get(Mention_nospace.toLowerCase())); + } + else if(Double.parseDouble(outputArr3_score.get(i))>threshold) + { + boolean overlap=false; + for(int j=0;jthreshold_GeneType && GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).get(j).matches(start+"\t"+last+"\t"+Mention_tmp+"\t(FamilyName|DomainMotif)") ) + { + GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).set(j, start+"\t"+last+"\t"+Mention+"\t"+MentionType); + } + else if( (start>=startj && startstartj && last<=lastj) ) + { + overlap=true; + } + } + if(overlap == false) + { + GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).add(start+"\t"+last+"\t"+Mention+"\t"+MentionType); + } + } + } + i--; + } + + paragraph_num_last=paragraph_num; + pmid_last=pmid; + }// outputArr3 + + //GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,false); //save in BioC file + } + + public void PostProcessing(String Filename,String FilenameBioC) throws XMLStreamException, IOException + { + /** Develop Cell | FamilyName | DomainMotif lists */ + String Disease_Suffix="disease|diseases|syndrome|syndromes|tumor|tumour|deficiency|dysgenesis|atrophy|frame|dystrophy"; + String Cell_Suffix="cell|cells"; + String FamilyName_Suffix="disease|diseases|syndrome|syndromes|tumor|tumour|deficiency|dysgenesis|atrophy|frame|dystrophy|frame|factors|family|families|superfamily|superfamilies|subfamily|subfamilies|complex|genes|proteins"; + String DomainMotif_Suffix="domain|motif|domains|motifs|sequences"; + String Strain_Suffix="alpha|beta|gamma|kappa|theta|delta|[A-Ga-g0-9]"; + ArrayList Translate2Family = new ArrayList(); + + for(int i=0;i Mention2Type_Hash = new HashMap(); // for substring detection - Extract all mentions in the target PMID : MentionList + ArrayList GeneMentionPattern = new ArrayList(); // pattern match to extend Gene + HashMap MentionType2Num = new HashMap(); // for frequency calculation + if(GNormPlus.BioCDocobj.PMIDs.size()>=i) + { + String pmid=GNormPlus.BioCDocobj.PMIDs.get(i); + for(int j=0;j RemoveList = new ArrayList(); + for(int k=0;k Family name (TIF & TIF1) */ + boolean SubSt=false; + /* + // GDNFb -> GDNF (not work on 12682085_J_Cell_Biol_2003.xml) + for (String men : Mention2Type_Hash.keySet()) + { + if((!men.equals(mention.toLowerCase())) && men.matches(mention_tmp+"[\\W\\-\\_]*("+Strain_Suffix+")")) + { + GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\tFamilyName"); + if(GNormPlus.PmidLF2Abb_lc_hash.containsKey(GNormPlus.BioCDocobj.PMIDs.get(i)+"\t"+mention.toLowerCase())) + { + Translate2Family.add(GNormPlus.PmidLF2Abb_lc_hash.get(GNormPlus.BioCDocobj.PMIDs.get(i)+"\t"+mention.toLowerCase())); + } + else if(GNormPlus.PmidAbb2LF_lc_hash.containsKey(GNormPlus.BioCDocobj.PMIDs.get(i)+"\t"+mention.toLowerCase())) + { + Translate2Family.add(GNormPlus.PmidAbb2LF_lc_hash.get(GNormPlus.BioCDocobj.PMIDs.get(i)+"\t"+mention.toLowerCase())); + } + SubSt=true; + break; + } + } + */ + if(SubSt == false) + { + int BoundaryLen=15; + if(GNormPlus.BioCDocobj.PassageContexts.get(i).get(j).length() Family/Domain/Cell */ + if( mention.toLowerCase().matches(".*("+Cell_Suffix+")") || SurroundingString.matches("("+Cell_Suffix+")") ) + { + type="Cell"; + GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\t"+type); + } + else if( mention.toLowerCase().matches(".*("+FamilyName_Suffix+")") || SurroundingString.matches("("+FamilyName_Suffix+")") ) + { + type="FamilyName"; + GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\t"+type); + } + else if( mention.toLowerCase().matches(".*("+DomainMotif_Suffix+")")|| SurroundingString.matches("("+DomainMotif_Suffix+")") ) + { + type="DomainMotif"; + GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\t"+type); + } + else if(!type.equals("Gene")) + { + /* 3. Check (Family+Domain+Cell)/All rate (threshold = 0.5) - Family/Domain/Cell -> Gene */ + double Num_FDC=0; + double Num_Gene=0; + if(MentionType2Num.containsKey(mention.toLowerCase()+"\tFamilyName")) + { + Num_FDC = Num_FDC + MentionType2Num.get(mention.toLowerCase()+"\tFamilyName"); + } + if(MentionType2Num.containsKey(mention.toLowerCase()+"\tDomainMotif")) + { + Num_FDC = Num_FDC + MentionType2Num.get(mention.toLowerCase()+"\tDomainMotif"); + } + if(MentionType2Num.containsKey(mention.toLowerCase()+"\tCell")) + { + Num_FDC = Num_FDC + MentionType2Num.get(mention.toLowerCase()+"\tCell"); + } + if(MentionType2Num.containsKey(mention.toLowerCase()+"\tGene")) + { + Num_Gene = Num_Gene + MentionType2Num.get(mention.toLowerCase()+"\tGene"); + } + if(Num_Gene/(Num_FDC+Num_Gene)>=0.5) + { + GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\tGene"); + } + + /* 4. Extend Genes to Family/Domain mentions by pattern match - Family/Domain/Cell -> Gene */ + for(int p=0;p Abb.type + * - Abb only : Abb.type -> LF.type + * - LF only : LF.type -> Abb.type + */ + String lc_ment=mention.toLowerCase(); + if(GNormPlus.PmidAbb2LF_lc_hash.containsKey(pmid+"\t"+lc_ment)) //the target mention is abbreviation + { + //Infer Abbreviation by Long form + if(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+lc_ment).matches(".*("+Disease_Suffix+")")) + { + //remove the mention (Abb), because the LF is a disease + } + else if(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+lc_ment).matches(".*("+Cell_Suffix+")")) + { + //GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Anno[0]+"\t"+Anno[1]+"\tCell"); + } + else if(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+lc_ment).matches(".*("+FamilyName_Suffix+")") && !lc_ment.matches(".+[a-z][0-9][a-z]")) //AtRPA1a in pmid:19153602 + { + GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\tFamilyName"); + } + else if(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+lc_ment).matches(".*("+DomainMotif_Suffix+")")) + { + GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\tDomainMotif"); + } + else + { + if(Mention2Type_Hash.containsKey(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+lc_ment)) + && Mention2Type_Hash.get(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+lc_ment)).equals("Gene") + && !(type.equals("Gene")) + ) // if Long Form is recognized as a Gene, and Abb is recognized as not a Gene + { + GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\tGene"); + } + } + } + } //if(Remov == true) + } + } + + for(int j=0;j GeneMentionPattern = new ArrayList(); // pattern match to extend Gene + HashMap GeneMentions = new HashMap(); // Extending Gene mentions + HashMap GeneMentionLocationGNR = new HashMap(); // Extending Gene mentions + for(int j=0;ji && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j) + { + String PassageContexts = " " + GNormPlus.BioCDocobj.PassageContexts.get(i).get(j) + " "; + String PassageContexts_tmp = PassageContexts.toLowerCase(); + for(String gm : GeneMentions.keySet()) + { + String type=GeneMentions.get(gm); + if(type.equals("Gene")) + { + gm = gm.replaceAll("([\\W\\-\\_])", "\\\\$1"); + gm=gm.replaceAll("[0-9]", "\\[0\\-9\\]"); + gm=gm.replaceAll("(alpha|beta|gamma|theta|zeta|delta)", "(alpha\\|beta\\|gamma\\|theta\\|zeta\\|delta)"); + gm=gm.replaceAll("\\-[a-z]$", "\\-\\[a\\-z\\]"); + Pattern ptmp = Pattern.compile("^(.*[\\W\\-\\_])("+gm+")([\\W\\-\\_].*)$"); + Matcher mtmp = ptmp.matcher(PassageContexts_tmp); + while(mtmp.find()) + { + String pre = mtmp.group(1); + String gmtmp = mtmp.group(2); + String post = mtmp.group(3); + + int start = pre.length()-1; + int last = start+gmtmp.length(); + if(PassageContexts.length()>last) + { + String mention = PassageContexts.substring(start+1,last+1); + if(!GeneMentionLocationGNR.containsKey(j+"\t"+start) && !GeneMentionLocationGNR.containsKey(j+"\t"+last)) + { + if(GNormPlus.BioCDocobj.Annotations.get(i).get(j).contains(start+"\t"+last+"\t"+mention+"\tFamilyName")) + { + GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(start+"\t"+last+"\t"+mention+"\tFamilyName"); + } + else if(GNormPlus.BioCDocobj.Annotations.get(i).get(j).contains(start+"\t"+last+"\t"+mention+"\tDomainMotif")) + { + GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(start+"\t"+last+"\t"+mention+"\tDomainMotif"); + } + GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tGene"); + } + gmtmp = gmtmp.replaceAll(".", "X"); + PassageContexts_tmp=pre+""+gmtmp+""+post; + mtmp = ptmp.matcher(PassageContexts_tmp); + } + } + } + } + } + } + + //Extend to all family mentions + for(int j=0;ji && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j) + { + String PassageContexts = " " + GNormPlus.BioCDocobj.PassageContexts.get(i).get(j) + " "; + String PassageContexts_tmp = PassageContexts.toLowerCase(); + for(String gm : GeneMentions.keySet()) + { + String type=GeneMentions.get(gm); + if(type.matches("(FamilyName|DomainMotif)")) + { + gm = gm.replaceAll("([\\W\\-\\_])", "\\\\$1"); + gm=gm.replaceAll("s$", "(s\\|)"); + Pattern ptmp = Pattern.compile("^(.*[\\W\\-\\_])("+gm+")([\\W\\-\\_].*)$"); + Matcher mtmp = ptmp.matcher(PassageContexts_tmp); + while(mtmp.find()) + { + String pre = mtmp.group(1); + String gmtmp = mtmp.group(2); + String post = mtmp.group(3); + + int start = pre.length()-1; + int last = start+gmtmp.length(); + if(PassageContexts.length()>last) + { + String mention = PassageContexts.substring(start+1,last+1); + if(!GeneMentionLocationGNR.containsKey(j+"\t"+start) && !GeneMentionLocationGNR.containsKey(j+"\t"+last)) + { + if(!GNormPlus.BioCDocobj.Annotations.get(i).get(j).contains(start+"\t"+last+"\t"+mention+"\tGene")) + { + GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\t"+type); + } + } + gmtmp = gmtmp.replaceAll(".", "X"); + PassageContexts_tmp=pre+""+gmtmp+""+post; + mtmp = ptmp.matcher(PassageContexts_tmp); + } + } + } + } + } + } + } + } + GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,false); //save in BioC file + } +} + +