/** * Project: GNormPlus * Function: Gene Name Recognition */ package GNormPluslib; import java.io.*; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.stream.XMLStreamException; import org.tartarus.snowball.SnowballStemmer; import org.tartarus.snowball.ext.englishStemmer; import GNormPluslib.GNormPlus; import GNormPluslib.BioCDoc; public class GNR { /* * Read BioC files */ public void Ab3P(String Filename,String FilenameAbb,String TrainTest) throws XMLStreamException,IOException { /** Abbreviation*/ //BioC -> Abb input String line=""; BufferedWriter FileAbb = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FilenameAbb), "UTF-8")); for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) { String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i); String Context=""; for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) { String PassageContext=GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); if(PassageContext.matches(".*\\([^\\(\\)]+,[^\\(\\)]+\\).*")) { PassageContext=PassageContext.replaceAll("\\([^\\(\\)]+,[^\\(\\)]+\\)", ""); } if(PassageContext.contains("\\(")) { Context = Context+PassageContext+" "; } } FileAbb.write(Pmid+"\n"+Context+"\n\n"); } FileAbb.close(); //Abb File f = new File(FilenameAbb+".out"); BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), "UTF-8")); Runtime runtime = Runtime.getRuntime(); String cmd ="./Ab3P "+FilenameAbb+".Abb "+FilenameAbb+".out"; String OS=System.getProperty("os.name").toLowerCase(); if(OS.contains("windows")) { cmd ="java -jar bioadi.jar "+FilenameAbb; } else //if(OS.contains("nux")||OS.contains("nix")) { cmd ="./Ab3P "+FilenameAbb+" "+FilenameAbb+".out"; //cmd ="java -jar bioadi.jar "+FilenameAbb+" > "+FilenameAbb+".out"; } Process process = runtime.exec(cmd); InputStream is = process.getInputStream(); InputStreamReader isr = new InputStreamReader(is, "UTF-8"); BufferedReader br = new BufferedReader(isr); 
line=""; while ( (line = br.readLine()) != null) { fr.write(line); fr.newLine(); fr.flush(); } is.close(); isr.close(); br.close(); fr.close(); //Abb output -> Hash BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameAbb+".out"), "UTF-8")); line=""; String pmid=""; while ((line = inputfile.readLine()) != null) { String patt="^ (.+)\\|(.+)\\|([0-9\\.]+)$"; Pattern ptmp = Pattern.compile(patt); Matcher mtmp = ptmp.matcher(line); if(line.matches("^[0-9]+$")) { pmid=line; } if(mtmp.find()) { String SF = mtmp.group(1); String LF = mtmp.group(2); double weight= Double.parseDouble(mtmp.group(3)); GNormPlus.Pmid2Abb_hash.put(pmid+"\t"+SF, "Abb:SF"); GNormPlus.Pmid2Abb_hash.put(pmid+"\t"+LF, "Abb:LF"); GNormPlus.PmidLF2Abb_lc_hash.put(pmid+"\t"+LF.toLowerCase(), SF.toLowerCase()); GNormPlus.PmidAbb2LF_lc_hash.put(pmid+"\t"+SF.toLowerCase(), LF.toLowerCase()); GNormPlus.PmidAbb2LF_hash.put(pmid+"\t"+SF, LF); if(weight >= 0.9) { GNormPlus.PmidLF2Abb_hash.put(pmid+"\t"+LF, SF); } } } inputfile.close(); } public void LoadInputFile(String Filename,String FilenameAbb,String TrainTest) throws XMLStreamException,IOException { /** Read BioC file */ //if(TrainTest.equals("Train")) //{ GNormPlus.BioCDocobj.BioCReaderWithAnnotation(Filename); //} //else //{ // GNormPlus.BioCDocobj.BioCReader(Filename); //} /** Abbreviation*/ //BioC -> Abb input String line=""; BufferedWriter FileAbb = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FilenameAbb), "UTF-8")); for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) { String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i); String Context="Text:"; for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) { String PassageContext=GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); if(PassageContext.matches(".*\\([^\\(\\)]+,[^\\(\\)]+\\).*")) { PassageContext=PassageContext.replaceAll("\\([^\\(\\)]+,[^\\(\\)]+\\)", ""); } if(PassageContext.contains("(")) { Context = 
Context+PassageContext+" "; } } FileAbb.write(Pmid+"\n"+Context+"\n\n"); } FileAbb.close(); //Abb File f = new File(FilenameAbb+".out"); BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), "UTF-8")); Runtime runtime = Runtime.getRuntime(); String cmd ="./Ab3P "+FilenameAbb+".Abb "+FilenameAbb+".out"; String OS=System.getProperty("os.name").toLowerCase(); if(OS.contains("windows")) { cmd ="java -jar bioadi.jar "+FilenameAbb; } else //if(OS.contains("nux")||OS.contains("nix")) { cmd ="./Ab3P "+FilenameAbb+" "+FilenameAbb+".out"; //cmd ="java -jar bioadi.jar "+FilenameAbb+" > "+FilenameAbb+".out"; } Process process = runtime.exec(cmd); InputStream is = process.getInputStream(); InputStreamReader isr = new InputStreamReader(is, "UTF-8"); BufferedReader br = new BufferedReader(isr); line=""; while ( (line = br.readLine()) != null) { fr.write(line); fr.newLine(); fr.flush(); } is.close(); isr.close(); br.close(); fr.close(); //Abb output -> Hash BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameAbb+".out"), "UTF-8")); line=""; String pmid=""; while ((line = inputfile.readLine()) != null) { String patt="^ (.+)\\|(.+)\\|([0-9\\.]+)$"; Pattern ptmp = Pattern.compile(patt); Matcher mtmp = ptmp.matcher(line); if(line.matches("^[0-9]+$")) { pmid=line; } if(mtmp.find()) { String SF = mtmp.group(1); String LF = mtmp.group(2); double weight= Double.parseDouble(mtmp.group(3)); GNormPlus.Pmid2Abb_hash.put(pmid+"\t"+SF, "Abb:SF"); GNormPlus.Pmid2Abb_hash.put(pmid+"\t"+LF, "Abb:LF"); GNormPlus.PmidLF2Abb_lc_hash.put(pmid+"\t"+LF.toLowerCase(), SF.toLowerCase()); GNormPlus.PmidAbb2LF_lc_hash.put(pmid+"\t"+SF.toLowerCase(), LF.toLowerCase()); GNormPlus.PmidAbb2LF_hash.put(pmid+"\t"+SF, LF); if(weight >= 0.9) { GNormPlus.PmidLF2Abb_hash.put(pmid+"\t"+LF, SF); } } } inputfile.close(); } /* * Feature Extraction */ public void FeatureExtraction(String FilenameData,String FilenameLoca,String TrainTest) throws 
XMLStreamException { try { /** output files */ BufferedWriter FileLocation = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FilenameLoca), "UTF-8")); // .location BufferedWriter FileData = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FilenameData), "UTF-8")); // .data //NLP modules SnowballStemmer stemmer = new englishStemmer(); /** PMIDs : i */ for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) { String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i); /** Paragraphs : j */ for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) { String PassageName= GNormPlus.BioCDocobj.PassageNames.get(i).get(j); // Passage name int PassageOffset = GNormPlus.BioCDocobj.PassageOffsets.get(i).get(j); // Passage offset String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(i).get(j); // Passage context ArrayList Annotation = GNormPlus.BioCDocobj.Annotations.get(i).get(j); // Annotation HashMap CTDGene_hash = new HashMap(); HashMap FamilyName_hash = new HashMap(); HashMap character_hash = new HashMap(); HashMap Abbreviation_hash = new HashMap(); String PassageContext_tmp=" "+PassageContext+" "; /** Abbreviation */ HashMap Abb_sortebylength = new HashMap(); ArrayList length_list = new ArrayList(); int countn=0; for (Object key : GNormPlus.Pmid2Abb_hash.keySet()) { String pmid2abb[]=key.toString().split("\t"); if(Pmid.equals(pmid2abb[0])) { Abb_sortebylength.put(pmid2abb[1].length()*100+countn, pmid2abb[1]); length_list.add(pmid2abb[1].length()*100+countn); countn++; } } Collections.sort(length_list); for (int l=length_list.size()-1;l>=0;l--) { String AbbLF = Abb_sortebylength.get(length_list.get(l)); AbbLF=AbbLF.replaceAll("([^A-Za-z0-9@ ])", "\\\\$1"); AbbLF=AbbLF.replaceAll(" ", "\\[ \\]\\+"); Pattern ptmp = Pattern.compile("^(.*[^A-Za-z0-9]+)("+AbbLF+")([^A-Za-z0-9]+.*)$"); Matcher mtmp = ptmp.matcher(PassageContext_tmp); while(mtmp.find()) { String str1=mtmp.group(1); String str2=mtmp.group(2); String 
str3=mtmp.group(3); for(int m=str1.length();m<=(str1.length()+str2.length());m++) { Abbreviation_hash.put((m-1),GNormPlus.Pmid2Abb_hash.get(Pmid+"\t"+Abb_sortebylength.get(length_list.get(l)))); } String men=""; for(int m=0;m locations = GNormPlus.PT_CTDGene.SearchMentionLocation(PassageContext,"CTDGene"); for (int k = 0 ; k < locations.size() ; k++) { String anno[]=locations.get(k).split("\t"); int start= Integer.parseInt(anno[0]) + PassageOffset; int last= Integer.parseInt(anno[1]) + PassageOffset; String mention = anno[2]; String id = anno[3]; CTDGene_hash.put(start,"CTDGene_B"); CTDGene_hash.put(last,"CTDGene_E"); for(int s=start+1;s locations_Fname = GNormPlus.PT_FamilyName.SearchMentionLocation(PassageContext,"FamilyName"); for (int k = 0 ; k < locations_Fname.size() ; k++) { String anno[]=locations_Fname.get(k).split("\t"); int start= Integer.parseInt(anno[0]) + PassageOffset; int last= Integer.parseInt(anno[1]) + PassageOffset; String mention = anno[2]; String id = anno[3]; if(!CTDGene_hash.containsKey(start)) { FamilyName_hash.put(start,"famplex_B"); FamilyName_hash.put(last,"famplex_E"); for(int s=start+1;stokens[p].length() && PassageContext_tmp.substring(tokens[p].length(),tokens[p].length()+1).equals(" ")) { WSF="WSF:Gap"; } if(p==0) { WSB="WSB:1st"; } else if(p==tokens.length-1) { WSF="WSF:last"; } if(PassageContext_tmp.substring(0,tokens[p].length()).equals(tokens[p])) { if(tokens[p].length()>0) { /* * .loca */ int start=Offset; int last=Offset+tokens[p].length(); String State=""; if(!character_hash.containsKey(start) || !character_hash.containsKey(last)){} else if(character_hash.get(start).matches(".*B$")) { State=character_hash.get(start); } else if(character_hash.get(last).matches(".*E$")) { State=character_hash.get(last); } else if(character_hash.get(start).matches(".*I$")) { State=character_hash.get(start); } if((!tokens[p].equals("\t"))) { 
FileLocation.write(Pmid+"\t"+PassageName+"\t"+j+"\t"+tokens[p]+"\t"+(Offset+1)+"\t"+(Offset+tokens[p].length())+"\t"+State+"\n"); } /* * .data */ //Abbreviation String Abb_State="__nil__"; if(!Abbreviation_hash.containsKey(start) || !Abbreviation_hash.containsKey(last)){Abb_State="__nil__";} else if(Abbreviation_hash.containsKey(start)) { Abb_State=Abbreviation_hash.get(start); } //CTDGene start=PassageOffset+Offset; last=PassageOffset+Offset+tokens[p].length(); String CTDGene_State="__nil__"; if(!CTDGene_hash.containsKey(start) || !CTDGene_hash.containsKey(last)){CTDGene_State="__nil__";} else if(CTDGene_hash.get(start).matches(".*B$")) { CTDGene_State=CTDGene_hash.get(start); } else if(CTDGene_hash.get(last).matches(".*E$")) { CTDGene_State=CTDGene_hash.get(last); } else if(CTDGene_hash.get(start).matches(".*I$")) { CTDGene_State=CTDGene_hash.get(start); } //FamilyName if(CTDGene_State.equals("__nil__")) { start=PassageOffset+Offset; last=PassageOffset+Offset+tokens[p].length(); if(!FamilyName_hash.containsKey(start) || !FamilyName_hash.containsKey(last)){} else if(FamilyName_hash.get(start).matches(".*B$")) { CTDGene_State=FamilyName_hash.get(start); } else if(FamilyName_hash.get(last).matches(".*E$")) { CTDGene_State=FamilyName_hash.get(last); } else if(FamilyName_hash.get(start).matches(".*I$")) { CTDGene_State=FamilyName_hash.get(start); } } //stemming stemmer.setCurrent(tokens[p].toLowerCase()); stemmer.stem(); String stem=stemmer.getCurrent(); //Number of Numbers [0-9] String Num_num=""; String tmp=tokens[p]; tmp=tmp.replaceAll("[^0-9]",""); if(tmp.length()>3){Num_num="N:4+";}else{Num_num="N:"+ tmp.length();} //Number of Uppercase [A-Z] String Num_Uc=""; tmp=tokens[p]; tmp=tmp.replaceAll("[^A-Z]",""); if(tmp.length()>3){Num_Uc="U:4+";}else{Num_Uc="U:"+ tmp.length();} //Number of Lowercase [a-z] String Num_lc=""; tmp=tokens[p]; tmp=tmp.replaceAll("[^a-z]",""); if(tmp.length()>3){Num_lc="L:4+";}else{Num_lc="L:"+ tmp.length();} //Number of ALL char String 
Num_All=""; if(tokens[p].length()>3){Num_All="A:4+";}else{Num_All="A:"+ tokens[p].length();} //specific character (;:,.->+_) String SpecificC="__nil__"; if(tokens[p].equals(";") || tokens[p].equals(":") || tokens[p].equals(",") || tokens[p].equals(".") || tokens[p].equals("-") || tokens[p].equals(">") || tokens[p].equals("+") || tokens[p].equals("_")) { SpecificC="-SpecificC1-"; } else if(tokens[p].equals("(") || tokens[p].equals(")")) { SpecificC="-SpecificC2-"; } else if(tokens[p].equals("{") || tokens[p].equals("}")) { SpecificC="-SpecificC3-"; } else if(tokens[p].equals("[") || tokens[p].equals("]")) { SpecificC="-SpecificC4-"; } else if(tokens[p].equals("\\") || tokens[p].equals("/")) { SpecificC="-SpecificC5-"; } //Chemical Prefix/Suffix String ChemPreSuf="__nil__"; if(tokens[p].matches(".*(yl|ylidyne|oyl|sulfonyl)")){ChemPreSuf="-CHEMinlineSuffix-";} else if(tokens[p].matches("(meth|eth|prop|tetracos).*")){ChemPreSuf="-CHEMalkaneStem-";} else if(tokens[p].matches("(di|tri|tetra).*")){ChemPreSuf="-CHEMsimpleMultiplier-";} else if(tokens[p].matches("(benzen|pyridin|toluen).*")){ChemPreSuf="-CHEMtrivialRing-";} else if(tokens[p].matches(".*(one|ol|carboxylic|amide|ate|acid|ium|ylium|ide|uide|iran|olan|inan|pyrid|acrid|amid|keten|formazan|fydrazin)(s|)")){ChemPreSuf="-CHEMsuffix-";} //Mention Type String MentionType="__nil__"; /* if($tmp eq "to" && $CTD_result_hash{$count_token-1} eq "CTD_gene" && $CTD_result_hash{$count_token+1} eq "CTD_gene"){$CTD_result_hash{$count_token}="CTD_gene";} if($tmp=~/^(or|and|,)$/ && $CTD_result_hash{$count_token-1} eq "CTD_gene" && $CTD_result_hash{$count_token+1} eq "CTD_gene"){$MentionType="-Type_GeneConjunction-";} elsif($tmp=~/^(or|and|,)$/ && $last_token=~/^(or|and|,)$/ && $CTD_result_hash{$count_token-2} eq "CTD_gene" && $CTD_result_hash{$count_token+1} eq "CTD_gene"){$MentionType="-Type_GeneConjunction-";} elsif($tmp=~/^(or|and|,)$/ && $next_token=~/^(or|and|,)$/ && $CTD_result_hash{$count_token-1} eq "CTD_gene" && 
$CTD_result_hash{$count_token+2} eq "CTD_gene"){$MentionType="-Type_GeneConjunction-";} */ if(tokens[p].matches("(ytochrome|cytochrome)")){MentionType="-Type_cytochrome-";} else if(tokens[p].matches(".*target") ){MentionType="-Type_target-";} else if(tokens[p].matches(".*(irradiation|hybrid|fusion|experiment|gst|est|gap|antigen)") ){MentionType="-Type_ExperimentNoun-";} else if(tokens[p].matches(".*(disease|disorder|dystrophy|deficiency|syndrome|dysgenesis|cancer|injury|neoplasm|diabetes|diabete)") ){MentionType="-Type_Disease-";} else if(tokens[p].matches(".*(motif|domain|omain|binding|site|region|sequence|frameshift|finger|box).*") ){MentionType="-Type_DomainMotif-";} else if(tokens[p].equals("-") && (p0 && tokens[p-1].matches("^[0-9]+$")) ) ){MentionType="-Type_ChromosomeStrain-";} else if(tokens[p].matches(".*(related|regulated|associated|correlated|reactive).*")){MentionType="-Type_relation-";} else if(tokens[p].toLowerCase().matches(".*(polymorphism|mutation|deletion|insertion|duplication|genotype|genotypes).*") ){MentionType="-Type_VariationTerms-";} else if(tokens[p].matches(".*(oxidase|transferase|transferases|kinase|kinese|subunit|unit|receptor|adrenoceptor|transporter|regulator|transcription|antigen|protein|gene|factor|member|molecule|channel|deaminase|spectrin).*") ){MentionType="-Type_suffix-";} else if(tokens[p].matches("[\\(\\-\\_]") && (p=1){ prefix=tmp.substring(0, 1);}else{prefix="__nil__";} if(tmp.length()>=2){ prefix=prefix+" "+tmp.substring(0, 2);}else{prefix=prefix+" __nil__";} if(tmp.length()>=3){ prefix=prefix+" "+tmp.substring(0, 3);}else{prefix=prefix+" __nil__";} if(tmp.length()>=4){ prefix=prefix+" "+tmp.substring(0, 4);}else{prefix=prefix+" __nil__";} if(tmp.length()>=5){ prefix=prefix+" "+tmp.substring(0, 5);}else{prefix=prefix+" __nil__";} //suffix String suffix=""; tmp=tokens[p]; if(tmp.length()>=1){ suffix=tmp.substring(tmp.length()-1, tmp.length());}else{suffix="__nil__";} if(tmp.length()>=2){ suffix=suffix+" 
"+tmp.substring(tmp.length()-2, tmp.length());}else{suffix=suffix+" __nil__";} if(tmp.length()>=3){ suffix=suffix+" "+tmp.substring(tmp.length()-3, tmp.length());}else{suffix=suffix+" __nil__";} if(tmp.length()>=4){ suffix=suffix+" "+tmp.substring(tmp.length()-4, tmp.length());}else{suffix=suffix+" __nil__";} if(tmp.length()>=5){ suffix=suffix+" "+tmp.substring(tmp.length()-5, tmp.length());}else{suffix=suffix+" __nil__";} if(State.equals("")) { State="O"; } if((!tokens[p].equals("\t"))) { if(TrainTest.equals("Train")) { FileData.write(tokens[p]+" "+stem+" "+WSB+" "+WSF+" "+Num_num+" "+Num_Uc+" "+Num_lc+" "+Num_All+" "+SpecificC+" "+ChemPreSuf+" "+MentionType+" "+ProteinSym+" "+prefix+" "+suffix+" "+CTDGene_State+" "+Abb_State+" "+State+"\n"); } else { FileData.write(tokens[p]+" "+stem+" "+WSB+" "+WSF+" "+Num_num+" "+Num_Uc+" "+Num_lc+" "+Num_All+" "+SpecificC+" "+ChemPreSuf+" "+MentionType+" "+ProteinSym+" "+prefix+" "+suffix+" "+CTDGene_State+" "+Abb_State+"\n"); } } PassageContext_tmp=PassageContext_tmp.substring(tokens[p].length()); // remove the token for the context Offset=Offset+tokens[p].length(); } } } if(tokens.length>0) { FileLocation.write("\n"); FileData.write("\n"); } } } FileLocation.close(); FileData.close(); } catch(IOException e1){ System.out.println("[MR]: Input file is not exist.");} } /* * Testing by CRF++ */ public void CRF_test(String model, String FilenameData, String FilenameOutput) throws IOException { File f = new File(FilenameOutput); BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), "UTF-8")); Runtime runtime = Runtime.getRuntime(); String OS=System.getProperty("os.name").toLowerCase(); String cmd="./CRF/crf_test -m "+model+" -o "+FilenameOutput+" "+FilenameData; if(OS.contains("windows")) { cmd ="CRF/crf_test -m "+model+" -o "+FilenameOutput+" "+FilenameData; } else //if(OS.contains("nux")||OS.contains("nix")) { cmd ="./CRF/crf_test -m "+model+" -o "+FilenameOutput+" "+FilenameData; } try { Process 
process = runtime.exec(cmd); InputStream is = process.getInputStream(); InputStreamReader isr = new InputStreamReader(is, "UTF-8"); BufferedReader br = new BufferedReader(isr); String line=""; while ( (line = br.readLine()) != null) { fr.write(line); fr.newLine(); fr.flush(); } is.close(); isr.close(); br.close(); fr.close(); } catch (IOException e) { System.out.println(e); runtime.exit(0); } } public void CRF_test(String model,String FilenameData,String FilenameOutput,String top3) throws IOException { File f = new File(FilenameOutput); BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), "UTF-8")); Runtime runtime = Runtime.getRuntime(); String OS=System.getProperty("os.name").toLowerCase(); String cmd="./CRF/crf_test -n 3 -m "+model+" -o "+FilenameOutput+" "+FilenameData; if(OS.contains("windows")) { cmd ="CRF/crf_test -n 3 -m "+model+" -o "+FilenameOutput+" "+FilenameData; } else //if(OS.contains("nux")||OS.contains("nix")) { cmd ="./CRF/crf_test -n 3 -m "+model+" -o "+FilenameOutput+" "+FilenameData; } try { Process process = runtime.exec(cmd); InputStream is = process.getInputStream(); InputStreamReader isr = new InputStreamReader(is, "UTF-8"); BufferedReader br = new BufferedReader(isr); String line=""; while ( (line = br.readLine()) != null) { fr.write(line); fr.newLine(); fr.flush(); } is.close(); isr.close(); br.close(); fr.close(); } catch (IOException e) { System.out.println(e); runtime.exit(0); } } /* * Learning model by CRF++ */ public void CRF_learn(String model, String FilenameData) throws IOException { Runtime runtime = Runtime.getRuntime(); Process process = null; String line = null; InputStream is = null; InputStreamReader isr = null; BufferedReader br = null; String OS=System.getProperty("os.name").toLowerCase(); String cmd="./CRF/crf_learn -f 3 -c 4.0 CRF/template_UB "+FilenameData+" "+model; if(OS.contains("windows")) { cmd ="CRF/crf_learn -f 3 -c 4.0 CRF/template_UB "+FilenameData+" "+model; } else 
//if(OS.contains("nux")||OS.contains("nix")) { cmd ="./CRF/crf_learn -f 3 -c 4.0 CRF/template_UB "+FilenameData+" "+model; } try { process = runtime.exec(cmd); is = process.getInputStream(); isr = new InputStreamReader(is, "UTF-8"); br = new BufferedReader(isr); while ( (line = br.readLine()) != null) { System.out.println(line); System.out.flush(); } is.close(); isr.close(); br.close(); } catch (IOException e) { System.out.println(e); runtime.exit(0); } } public void ReadCRFresult(String Filename,String FilenameLoca,String FilenameOutput,String FilenameBioC) throws XMLStreamException, IOException { /** load CRF output */ ArrayList outputArr = new ArrayList(); BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameOutput), "UTF-8")); String line; while ((line = inputfile.readLine()) != null) { outputArr.add(line); } inputfile.close(); /** load location */ ArrayList locationArr = new ArrayList(); inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameLoca), "UTF-8")); while ((line = inputfile.readLine()) != null) { locationArr.add(line); } inputfile.close(); /** output -> mentions */ String pmid_last=""; String paragraph_num_last=""; String pmid=""; String paragraph=""; String paragraph_num=""; Pattern pat_B = Pattern.compile("((FamilyName|DomainMotif|Gene)_[B])$"); Pattern pat_IE = Pattern.compile("((FamilyName|DomainMotif|Gene)_[IE])$"); ArrayList> AnnotationInPMID = new ArrayList(); // array of Annotations in the PMIDs ArrayList AnnotationInPassage= new ArrayList(); // array of Annotations in the Passage GNormPlus.BioCDocobj.Annotations = new ArrayList(); int countPMID=0; int countPassage=0; /** outputArr */ for(int i=0;i3) { pmid=locationRow[0]; paragraph=locationRow[1]; paragraph_num=locationRow[2]; } if( (!paragraph_num_last.equals("")) && (!paragraph_num.equals(paragraph_num_last)) ) { AnnotationInPMID.add(AnnotationInPassage); AnnotationInPassage = new ArrayList(); countPassage++; } if( 
(!pmid_last.equals("")) && (!pmid.equals(pmid_last)) ) { GNormPlus.BioCDocobj.Annotations.add(AnnotationInPMID); AnnotationInPMID = new ArrayList(); countPMID++; countPassage=0; } boolean F = false; //Flag of Finding if(locationRow.length>2) { Matcher mat = pat_B.matcher(outputsRow[outputsRow.length-1]); // last column : Status while(mat.find() && locationRow.length==6) { MentionType=mat.group(2); pmid=locationRow[0]; paragraph_num=locationRow[2]; int start_tmp=Integer.parseInt(locationRow[4])-1; int last_tmp=Integer.parseInt(locationRow[5]); if(start_tmplast){last=last_tmp;} i++; F = true; if(locationArr.get(i).length()>0) { outputsRow=outputArr.get(i).split("\\t"); locationRow=locationArr.get(i).split("\\t"); mat = pat_IE.matcher(outputsRow[outputsRow.length-1]); } else { break; } } } if(F == true) { String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage); // Passage context String Mention = PassageContext.substring(start, last); String Mention_nospace = Mention.replaceAll("[\\W\\-\\_]", ""); if(Mention.toLowerCase().matches("(figure|tables|fig|tab|exp\\. 
[0-9]+).*")){} else if(Mention.matches("[A-Z][A-Z]s")){} else if(Mention.matches(".*\\|.*")){} else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\;\\,\\'\\/\\\\].*")){} else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\(].*") && !Mention.matches(".*[\\)].*")){} else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\[].*") && !Mention.matches(".*[\\]].*")){} else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\)].*") && !Mention.matches(".*[\\(].*")){} else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\]].*") && !Mention.matches(".*[\\[].*")){} else { AnnotationInPassage.add(start+"\t"+last+"\t"+Mention+"\t"+MentionType); } i--; } paragraph_num_last=paragraph_num; pmid_last=pmid; }// outputArr1 AnnotationInPMID.add(AnnotationInPassage); GNormPlus.BioCDocobj.Annotations.add(AnnotationInPMID); //GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,false); //save in BioC file } public void ReadCRFresult(String Filename,String FilenameLoca,String FilenameOutput,String FilenameBioC,double threshold,double threshold_GeneType) throws XMLStreamException, IOException { /** load CRF output */ ArrayList outputArr1 = new ArrayList(); ArrayList outputArr2 = new ArrayList(); ArrayList outputArr3 = new ArrayList(); ArrayList outputArr1_score = new ArrayList(); ArrayList outputArr2_score = new ArrayList(); ArrayList outputArr3_score = new ArrayList(); BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameOutput), "UTF-8")); String line; int rank=0; String score=""; Pattern pat_Rank = Pattern.compile("^# ([0-2]) ([0-9\\.]+)$"); while ((line = inputfile.readLine()) != null) { Matcher mat = pat_Rank.matcher(line); // last column : Status if(mat.find()) { rank = Integer.parseInt(mat.group(1)); score = 
mat.group(2); } else if(rank == 0) { outputArr1.add(line); outputArr1_score.add(score); } else if(rank == 1) { outputArr2.add(line); outputArr2_score.add(score); } else if(rank == 2) { outputArr3.add(line); outputArr3_score.add(score); } } inputfile.close(); /** load location */ ArrayList locationArr = new ArrayList(); inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameLoca), "UTF-8")); while ((line = inputfile.readLine()) != null) { locationArr.add(line); } inputfile.close(); /** output -> mentions */ String pmid_last=""; String paragraph_num_last=""; String pmid=""; String paragraph=""; String paragraph_num=""; Pattern pat_B = Pattern.compile("((FamilyName|DomainMotif|Gene)_[B])$"); Pattern pat_IE = Pattern.compile("((FamilyName|DomainMotif|Gene)_[IE])$"); ArrayList> AnnotationInPMID = new ArrayList(); // array of Annotations in the PMIDs ArrayList AnnotationInPassage= new ArrayList(); // array of Annotations in the Passage GNormPlus.BioCDocobj.Annotations = new ArrayList(); int countPMID=0; int countPassage=0; /** outputArr1 */ int size_Arr=outputArr1.size(); if(locationArr.size()3) { pmid=locationRow[0]; paragraph=locationRow[1]; paragraph_num=locationRow[2]; } boolean F = false; //Flag of Finding if(outputsRow.length>=1) { Matcher mat = pat_B.matcher(outputsRow[outputsRow.length-1]); // last column : Status while(mat.find() && locationRow.length==6) { MentionType=mat.group(2); pmid=locationRow[0]; int start_tmp=Integer.parseInt(locationRow[4])-1; int last_tmp=Integer.parseInt(locationRow[5]); if(start_tmplast){last=last_tmp;} i++; outputsRow=outputArr1.get(i).split("\\t"); locationRow=locationArr.get(i).split("\\t"); mat = pat_IE.matcher(outputsRow[outputsRow.length-1]); F = true; } } if( (!paragraph_num_last.equals("")) && (!paragraph_num.equals(paragraph_num_last)) ) // paragraph change { AnnotationInPMID.add(AnnotationInPassage); AnnotationInPassage = new ArrayList(); countPassage++; } if( !pmid.equals(pmid_last) && 
paragraph_num.equals("0") && paragraph_num_last.equals("0") ) // pmid change (special case : the article only has one paragrpah) { AnnotationInPMID.add(AnnotationInPassage); AnnotationInPassage = new ArrayList(); GNormPlus.BioCDocobj.Annotations.add(AnnotationInPMID); AnnotationInPMID = new ArrayList(); countPMID++; countPassage=0; } else if( (!pmid_last.equals("")) && (!pmid.equals(pmid_last)) ) // pmid change { GNormPlus.BioCDocobj.Annotations.add(AnnotationInPMID); AnnotationInPMID = new ArrayList(); countPMID++; countPassage=0; } if(F == true) { if(GNormPlus.BioCDocobj.PassageContexts.size()>countPMID && GNormPlus.BioCDocobj.PassageContexts.get(countPMID).size()>countPassage && GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage).length()>=last && (last-start)<1000) { String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage); // Passage context String Mention = PassageContext.substring(start, last); String Mention_nospace = Mention.replaceAll("[\\W\\-\\_]", ""); if(Mention.toLowerCase().matches("(figure|tables|fig|tab|exp\\. 
[0-9]+).*")){} else if(Mention.matches("[A-Z][A-Z]s")){} else if(Mention.matches(".*\\|.*")){} else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\;\\,\\'\\/\\\\].*")){} else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\(].*") && !Mention.matches(".*[\\)].*")){} else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\[].*") && !Mention.matches(".*[\\]].*")){} else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\)].*") && !Mention.matches(".*[\\(].*")){} else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\]].*") && !Mention.matches(".*[\\[].*")){} else if((GNormPlus.Abb2Longformtok_hash.containsKey(Mention_nospace.toLowerCase())) && (PassageContext.toLowerCase().matches(".*[\\W\\-\\-]("+GNormPlus.Abb2Longformtok_hash.get(Mention_nospace.toLowerCase())+")[\\W\\-\\-].*"))) { //System.out.println(Mention_nospace.toLowerCase()+"\t"+GNormPlus.Abb2Longformtok_hash.get(Mention_nospace.toLowerCase())); } else { AnnotationInPassage.add(start+"\t"+last+"\t"+Mention+"\t"+MentionType); } } i--; } paragraph_num_last=paragraph_num; pmid_last=pmid; }// outputArr1 AnnotationInPMID.add(AnnotationInPassage); GNormPlus.BioCDocobj.Annotations.add(AnnotationInPMID); /** outputArr2 */ pmid_last=""; paragraph_num_last=""; pmid=""; paragraph=""; paragraph_num=""; countPMID=0; countPassage=0; size_Arr=outputArr2.size(); if(locationArr.size()2) { pmid=locationRow[0]; paragraph=locationRow[1]; paragraph_num=locationRow[2]; } boolean F = false; //Flag of Finding if(outputsRow.length>=1) { Matcher mat = pat_B.matcher(outputsRow[outputsRow.length-1]); // last column : Status while(mat.find() && locationRow.length==6) { MentionType=mat.group(2); pmid=locationRow[0]; int start_tmp=Integer.parseInt(locationRow[4])-1; int last_tmp=Integer.parseInt(locationRow[5]); if(start_tmplast){last=last_tmp;} 
i++; outputsRow=outputArr2.get(i).split("\\t"); locationRow=locationArr.get(i).split("\\t"); mat = pat_IE.matcher(outputsRow[outputsRow.length-1]); F = true; } } if( (!paragraph_num_last.equals("")) && (!paragraph_num.equals(paragraph_num_last)) ) // paragraph change { countPassage++; } if( !pmid.equals(pmid_last) && paragraph_num.equals("0") && paragraph_num_last.equals("0") ) // pmid change (special case : the article only has one paragrpah) { countPMID++; countPassage=0; } else if( (!pmid_last.equals("")) && (!pmid.equals(pmid_last)) ) // pmid change { countPMID++; countPassage=0; } if(F == true) { if(GNormPlus.BioCDocobj.PassageContexts.size()>countPMID && GNormPlus.BioCDocobj.PassageContexts.get(countPMID).size()>countPassage && GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage).length()>=last && (last-start)<1000) { String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage); // Passage context String Mention = PassageContext.substring(start, last); String Mention_nospace = Mention.replaceAll("[\\W\\-\\_]", ""); if(Mention.toLowerCase().matches("(figure|tables|fig|tab|exp\\. 
[0-9]+).*")){} else if(Mention.matches("[A-Z][A-Z]s")){} else if(Mention.matches(".*\\|.*")){} else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\;\\,\\'\\/\\\\].*")){} else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\(].*") && !Mention.matches(".*[\\)].*")){} else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\[].*") && !Mention.matches(".*[\\]].*")){} else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\)].*") && !Mention.matches(".*[\\(].*")){} else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\]].*") && !Mention.matches(".*[\\[].*")){} else if((GNormPlus.Abb2Longformtok_hash.containsKey(Mention_nospace.toLowerCase())) && (PassageContext.toLowerCase().matches(".*[\\W\\-\\-]("+GNormPlus.Abb2Longformtok_hash.get(Mention_nospace.toLowerCase())+")[\\W\\-\\-].*"))) { //System.out.println(Mention_nospace.toLowerCase()+"\t"+GNormPlus.Abb2Longformtok_hash.get(Mention_nospace.toLowerCase())); } else if(Double.parseDouble(outputArr2_score.get(i))>threshold) { boolean overlap=false; for(int j=0;jthreshold_GeneType && GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).get(j).matches(start+"\t"+last+"\t"+Mention_tmp+"\t(FamilyName|DomainMotif)") ) { GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).set(j, start+"\t"+last+"\t"+Mention+"\t"+MentionType); } else if( (start>=startj && startstartj && last<=lastj) ) { overlap=true; } } if(overlap == false) { GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).add(start+"\t"+last+"\t"+Mention+"\t"+MentionType); } } } i--; } paragraph_num_last=paragraph_num; pmid_last=pmid; }// outputArr2 /** outputArr3 */ pmid_last=""; paragraph_num_last=""; pmid=""; paragraph=""; paragraph_num=""; countPMID=0; countPassage=0; size_Arr=outputArr3.size(); if(locationArr.size()2) { 
pmid=locationRow[0]; paragraph=locationRow[1]; paragraph_num=locationRow[2]; } boolean F = false; //Flag of Finding if(outputsRow.length>=1) { Matcher mat = pat_B.matcher(outputsRow[outputsRow.length-1]); // last column : Status while(mat.find() && locationRow.length==6) { MentionType=mat.group(2); pmid=locationRow[0]; paragraph_num=locationRow[2]; int start_tmp=Integer.parseInt(locationRow[4])-1; int last_tmp=Integer.parseInt(locationRow[5]); if(start_tmplast){last=last_tmp;} i++; outputsRow=outputArr3.get(i).split("\\t"); locationRow=locationArr.get(i).split("\\t"); mat = pat_IE.matcher(outputsRow[outputsRow.length-1]); F = true; } } if( (!paragraph_num_last.equals("")) && (!paragraph_num.equals(paragraph_num_last)) ) // paragraph change { countPassage++; } if( !pmid.equals(pmid_last) && paragraph_num.equals("0") && paragraph_num_last.equals("0") ) // pmid change (special case : the article only has one paragrpah) { countPMID++; countPassage=0; } else if( (!pmid_last.equals("")) && (!pmid.equals(pmid_last)) ) // pmid change { countPMID++; countPassage=0; } if(F == true) { if(GNormPlus.BioCDocobj.PassageContexts.size()>countPMID && GNormPlus.BioCDocobj.PassageContexts.get(countPMID).size()>countPassage && GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage).length()>=last && (last-start)<1000) { String PassageContext = GNormPlus.BioCDocobj.PassageContexts.get(countPMID).get(countPassage); // Passage context String Mention = PassageContext.substring(start, last); String Mention_nospace = Mention.replaceAll("[\\W\\-\\_]", ""); if(Mention.toLowerCase().matches("(figure|tables|fig|tab|exp\\. 
[0-9]+).*")){} else if(Mention.matches("[A-Z][A-Z]s")){} else if(Mention.matches(".*\\|.*")){} else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\;\\,\\'\\/\\\\].*")){} else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\(].*") && !Mention.matches(".*[\\)].*")){} else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\[].*") && !Mention.matches(".*[\\]].*")){} else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\)].*") && !Mention.matches(".*[\\(].*")){} else if(Mention_nospace.length()<=3 && Mention.matches(".*[0-9].*") && Mention.matches(".*[\\]].*") && !Mention.matches(".*[\\[].*")){} else if((GNormPlus.Abb2Longformtok_hash.containsKey(Mention_nospace.toLowerCase())) && (PassageContext.toLowerCase().matches(".*[\\W\\-\\-]("+GNormPlus.Abb2Longformtok_hash.get(Mention_nospace.toLowerCase())+")[\\W\\-\\-].*"))) { //System.out.println(Mention_nospace.toLowerCase()+"\t"+GNormPlus.Abb2Longformtok_hash.get(Mention_nospace.toLowerCase())); } else if(Double.parseDouble(outputArr3_score.get(i))>threshold) { boolean overlap=false; for(int j=0;jthreshold_GeneType && GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).get(j).matches(start+"\t"+last+"\t"+Mention_tmp+"\t(FamilyName|DomainMotif)") ) { GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).set(j, start+"\t"+last+"\t"+Mention+"\t"+MentionType); } else if( (start>=startj && startstartj && last<=lastj) ) { overlap=true; } } if(overlap == false) { GNormPlus.BioCDocobj.Annotations.get(countPMID).get(countPassage).add(start+"\t"+last+"\t"+Mention+"\t"+MentionType); } } } i--; } paragraph_num_last=paragraph_num; pmid_last=pmid; }// outputArr3 //GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,false); //save in BioC file } /** Post-processes the annotations held in GNormPlus.BioCDocobj.Annotations: re-types mentions as Cell, FamilyName, DomainMotif or Gene using suffix lists, per-mention type counts (MentionType2Num) and abbreviation/long-form lookups (GNormPlus.PmidAbb2LF_lc_hash), then adds further Gene and FamilyName/DomainMotif annotations by regex-scanning each passage, and finally calls BioCDocobj.BioCOutput(Filename, FilenameBioC, Annotations, false, false). NOTE(review): parts of this method's text appear to be missing in this copy (loop headers and generic type arguments look truncated, e.g. "for(int i=0;i Mention2Type_Hash") - restore them from the upstream source before changing any logic. @param Filename passed through as the first argument of BioCOutput. @param FilenameBioC passed through as the second argument of BioCOutput. @throws XMLStreamException declared on the signature; the throwing call is presumably BioCOutput - confirm. @throws IOException declared on the signature; presumably from BioCOutput as well - confirm. */ public void PostProcessing(String Filename,String FilenameBioC) throws 
XMLStreamException, IOException { /** Develop Cell | FamilyName | DomainMotif lists */ /* Each *_Suffix string is a regex alternation that is spliced into String.matches() calls further down. FamilyName_Suffix repeats every Disease_Suffix term and lists "frame" twice. Strain_Suffix is referenced only inside the commented-out block in the text visible here. */ String Disease_Suffix="disease|diseases|syndrome|syndromes|tumor|tumour|deficiency|dysgenesis|atrophy|frame|dystrophy"; String Cell_Suffix="cell|cells"; String FamilyName_Suffix="disease|diseases|syndrome|syndromes|tumor|tumour|deficiency|dysgenesis|atrophy|frame|dystrophy|frame|factors|family|families|superfamily|superfamilies|subfamily|subfamilies|complex|genes|proteins"; String DomainMotif_Suffix="domain|motif|domains|motifs|sequences"; String Strain_Suffix="alpha|beta|gamma|kappa|theta|delta|[A-Ga-g0-9]"; ArrayList Translate2Family = new ArrayList(); for(int i=0;i Mention2Type_Hash = new HashMap(); // for substring detection - Extract all mentions in the target PMID : MentionList ArrayList GeneMentionPattern = new ArrayList(); // pattern match to extend Gene HashMap MentionType2Num = new HashMap(); // for frequency calculation /* NOTE(review): the guard below also passes when size() equals i, in which case PMIDs.get(i) would be out of range; the enclosing loop header is not visible in this copy - confirm whether that case can occur */ if(GNormPlus.BioCDocobj.PMIDs.size()>=i) { String pmid=GNormPlus.BioCDocobj.PMIDs.get(i); for(int j=0;j RemoveList = new ArrayList(); for(int k=0;k Family name (TIF & TIF1) */ boolean SubSt=false; /* // GDNFb -> GDNF (not work on 12682085_J_Cell_Biol_2003.xml) for (String men : Mention2Type_Hash.keySet()) { if((!men.equals(mention.toLowerCase())) && men.matches(mention_tmp+"[\\W\\-\\_]*("+Strain_Suffix+")")) { GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\tFamilyName"); if(GNormPlus.PmidLF2Abb_lc_hash.containsKey(GNormPlus.BioCDocobj.PMIDs.get(i)+"\t"+mention.toLowerCase())) { Translate2Family.add(GNormPlus.PmidLF2Abb_lc_hash.get(GNormPlus.BioCDocobj.PMIDs.get(i)+"\t"+mention.toLowerCase())); } else if(GNormPlus.PmidAbb2LF_lc_hash.containsKey(GNormPlus.BioCDocobj.PMIDs.get(i)+"\t"+mention.toLowerCase())) { Translate2Family.add(GNormPlus.PmidAbb2LF_lc_hash.get(GNormPlus.BioCDocobj.PMIDs.get(i)+"\t"+mention.toLowerCase())); } SubSt=true; break; } } */ if(SubSt == false) { int BoundaryLen=15; 
if(GNormPlus.BioCDocobj.PassageContexts.get(i).get(j).length() Family/Domain/Cell */ /* Suffix rules, checked in the order Cell, FamilyName, DomainMotif: the annotation at index k is rewritten with that type when the lower-cased mention ends with one of the suffixes, or when SurroundingString as a whole matches one of them. */ if( mention.toLowerCase().matches(".*("+Cell_Suffix+")") || SurroundingString.matches("("+Cell_Suffix+")") ) { type="Cell"; GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\t"+type); } else if( mention.toLowerCase().matches(".*("+FamilyName_Suffix+")") || SurroundingString.matches("("+FamilyName_Suffix+")") ) { type="FamilyName"; GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\t"+type); } else if( mention.toLowerCase().matches(".*("+DomainMotif_Suffix+")")|| SurroundingString.matches("("+DomainMotif_Suffix+")") ) { type="DomainMotif"; GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\t"+type); } else if(!type.equals("Gene")) { /* 3. Check (Family+Domain+Cell)/All rate (threshold = 0.5) - Family/Domain/Cell -> Gene */ /* Counts are read from MentionType2Num under the key lower-cased mention + TAB + type. The annotation is rewritten as Gene when the Gene share of all counted types is at least 0.5; if no count exists for this mention the ratio is 0.0/0.0 (NaN) and the comparison is false. */ double Num_FDC=0; double Num_Gene=0; if(MentionType2Num.containsKey(mention.toLowerCase()+"\tFamilyName")) { Num_FDC = Num_FDC + MentionType2Num.get(mention.toLowerCase()+"\tFamilyName"); } if(MentionType2Num.containsKey(mention.toLowerCase()+"\tDomainMotif")) { Num_FDC = Num_FDC + MentionType2Num.get(mention.toLowerCase()+"\tDomainMotif"); } if(MentionType2Num.containsKey(mention.toLowerCase()+"\tCell")) { Num_FDC = Num_FDC + MentionType2Num.get(mention.toLowerCase()+"\tCell"); } if(MentionType2Num.containsKey(mention.toLowerCase()+"\tGene")) { Num_Gene = Num_Gene + MentionType2Num.get(mention.toLowerCase()+"\tGene"); } if(Num_Gene/(Num_FDC+Num_Gene)>=0.5) { GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\tGene"); } /* 4. 
Extend Genes to Family/Domain mentions by pattern match - Family/Domain/Cell -> Gene */ for(int p=0;p Abb.type * - Abb only : Abb.type -> LF.type * - LF only : LF.type -> Abb.type */ /* Abbreviation rules: when pmid + TAB + lower-cased mention is a key of GNormPlus.PmidAbb2LF_lc_hash, the mapped long form is tested against the suffix lists in the order Disease, Cell, FamilyName, DomainMotif. In the text visible here the Disease and Cell branches contain only comments, so they change nothing; the FamilyName and DomainMotif branches rewrite the annotation type; otherwise a non-Gene mention becomes Gene when Mention2Type_Hash maps its long form to "Gene". */ String lc_ment=mention.toLowerCase(); if(GNormPlus.PmidAbb2LF_lc_hash.containsKey(pmid+"\t"+lc_ment)) //the target mention is abbreviation { //Infer Abbreviation by Long form if(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+lc_ment).matches(".*("+Disease_Suffix+")")) { //remove the mention (Abb), because the LF is a disease } else if(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+lc_ment).matches(".*("+Cell_Suffix+")")) { //GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, Anno[0]+"\t"+Anno[1]+"\tCell"); } else if(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+lc_ment).matches(".*("+FamilyName_Suffix+")") && !lc_ment.matches(".+[a-z][0-9][a-z]")) //AtRPA1a in pmid:19153602 { GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\tFamilyName"); } else if(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+lc_ment).matches(".*("+DomainMotif_Suffix+")")) { GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\tDomainMotif"); } else { if(Mention2Type_Hash.containsKey(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+lc_ment)) && Mention2Type_Hash.get(GNormPlus.PmidAbb2LF_lc_hash.get(pmid+"\t"+lc_ment)).equals("Gene") && !(type.equals("Gene")) ) // if Long Form is recognized as a Gene, and Abb is recognized as not a Gene { GNormPlus.BioCDocobj.Annotations.get(i).get(j).set(k, start+"\t"+last+"\t"+mention+"\tGene"); } } } } //if(Remov == true) } } for(int j=0;j GeneMentionPattern = new ArrayList(); // pattern match to extend Gene HashMap GeneMentions = new HashMap(); // Extending Gene mentions HashMap GeneMentionLocationGNR = new HashMap(); // Extending Gene mentions for(int j=0;ji && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j) { String PassageContexts = " " + GNormPlus.BioCDocobj.PassageContexts.get(i).get(j) + " "; String PassageContexts_tmp 
= PassageContexts.toLowerCase(); /* Gene extension: each key of GeneMentions typed "Gene" is turned into a relaxed regex - non-word characters, '-' and '_' are backslash-escaped, every digit becomes the class [0-9], any of alpha/beta/gamma/theta/zeta/delta becomes an alternation of all six, and a trailing hyphen plus lower-case letter becomes -[a-z] - and is searched in the lower-cased passage. The passage was padded with one leading and one trailing space, which is why start is pre.length()-1 and the mention is cut with substring(start+1,last+1). */ for(String gm : GeneMentions.keySet()) { String type=GeneMentions.get(gm); if(type.equals("Gene")) { gm = gm.replaceAll("([\\W\\-\\_])", "\\\\$1"); gm=gm.replaceAll("[0-9]", "\\[0\\-9\\]"); gm=gm.replaceAll("(alpha|beta|gamma|theta|zeta|delta)", "(alpha\\|beta\\|gamma\\|theta\\|zeta\\|delta)"); gm=gm.replaceAll("\\-[a-z]$", "\\-\\[a\\-z\\]"); Pattern ptmp = Pattern.compile("^(.*[\\W\\-\\_])("+gm+")([\\W\\-\\_].*)$"); Matcher mtmp = ptmp.matcher(PassageContexts_tmp); while(mtmp.find()) { String pre = mtmp.group(1); String gmtmp = mtmp.group(2); String post = mtmp.group(3); int start = pre.length()-1; int last = start+gmtmp.length(); if(PassageContexts.length()>last) { String mention = PassageContexts.substring(start+1,last+1); if(!GeneMentionLocationGNR.containsKey(j+"\t"+start) && !GeneMentionLocationGNR.containsKey(j+"\t"+last)) { if(GNormPlus.BioCDocobj.Annotations.get(i).get(j).contains(start+"\t"+last+"\t"+mention+"\tFamilyName")) { GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(start+"\t"+last+"\t"+mention+"\tFamilyName"); } else if(GNormPlus.BioCDocobj.Annotations.get(i).get(j).contains(start+"\t"+last+"\t"+mention+"\tDomainMotif")) { GNormPlus.BioCDocobj.Annotations.get(i).get(j).remove(start+"\t"+last+"\t"+mention+"\tDomainMotif"); } GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tGene"); } gmtmp = gmtmp.replaceAll(".", "X"); PassageContexts_tmp=pre+""+gmtmp+""+post; mtmp = ptmp.matcher(PassageContexts_tmp); } } } } } } //Extend to all family mentions for(int j=0;ji && GNormPlus.BioCDocobj.PassageContexts.get(i).size()>j) { String PassageContexts = " " + GNormPlus.BioCDocobj.PassageContexts.get(i).get(j) + " "; String PassageContexts_tmp = PassageContexts.toLowerCase(); for(String gm : GeneMentions.keySet()) { String type=GeneMentions.get(gm); if(type.matches("(FamilyName|DomainMotif)")) { gm = gm.replaceAll("([\\W\\-\\_])", "\\\\$1"); gm=gm.replaceAll("s$", "(s\\|)"); Pattern ptmp = 
Pattern.compile("^(.*[\\W\\-\\_])("+gm+")([\\W\\-\\_].*)$"); Matcher mtmp = ptmp.matcher(PassageContexts_tmp); while(mtmp.find()) { String pre = mtmp.group(1); String gmtmp = mtmp.group(2); String post = mtmp.group(3); int start = pre.length()-1; int last = start+gmtmp.length(); if(PassageContexts.length()>last) { String mention = PassageContexts.substring(start+1,last+1); if(!GeneMentionLocationGNR.containsKey(j+"\t"+start) && !GeneMentionLocationGNR.containsKey(j+"\t"+last)) { if(!GNormPlus.BioCDocobj.Annotations.get(i).get(j).contains(start+"\t"+last+"\t"+mention+"\tGene")) { GNormPlus.BioCDocobj.Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\t"+type); } } /* Overwrite the matched text with X characters and rebuild the matcher on the masked string; presumably this keeps the same occurrence from being matched again on the next find(). */ gmtmp = gmtmp.replaceAll(".", "X"); PassageContexts_tmp=pre+""+gmtmp+""+post; mtmp = ptmp.matcher(PassageContexts_tmp); } } } } } } } } GNormPlus.BioCDocobj.BioCOutput(Filename,FilenameBioC,GNormPlus.BioCDocobj.Annotations,false,false); //save in BioC file } }