/** * Project: GNormPlus * Function: SimConcept : Simplify Composite mentions */ package GNormPluslib; import bioc.BioCAnnotation; import bioc.BioCCollection; import bioc.BioCDocument; import bioc.BioCLocation; import bioc.BioCPassage; import bioc.io.BioCDocumentWriter; import bioc.io.BioCFactory; import bioc.io.woodstox.ConnectorWoodstox; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.text.BreakIterator; import java.time.LocalDate; import java.time.ZoneId; import java.text.DecimalFormat; import java.math.RoundingMode; import javax.xml.stream.XMLStreamException; import org.tartarus.snowball.SnowballStemmer; import org.tartarus.snowball.ext.englishStemmer; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Locale; public class SimConcept { /* * Feature Extraction */ public void FeatureExtraction_Train(String FilenameData) throws XMLStreamException { try { /** output files */ BufferedWriter FileData = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FilenameData), "UTF-8")); // .data //NLP modules SnowballStemmer stemmer = new englishStemmer(); /** PMIDs : i */ for (int i = 0; i < GNormPlus.BioCDocobj.PMIDs.size(); i++) { String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i); /** Paragraphs : j */ for (int j = 0; j < GNormPlus.BioCDocobj.PassageNames.get(i).size(); j++) { ArrayList Annotation = GNormPlus.BioCDocobj.Annotations.get(i).get(j); /** Annotations : k * 0 start * 1 last * 2 mention * 3 type * 4 id */ int Inital_Annotation_size=Annotation.size(); for (int k = 0; k < Annotation.size() ; k++) // k : Annotations { String anno[]=Annotation.get(k).split("\\t",-1); int MentionStart= Integer.parseInt(anno[0]); int MentionLast= Integer.parseInt(anno[1]); String Mention = anno[2]; String Type = anno[3]; if(anno.length>4) { String ID = anno[4]; String TokenSTR = Mention; TokenSTR = TokenSTR.replaceAll("([0-9])([A-Za-z])", "$1 $2"); TokenSTR = TokenSTR.replaceAll("([A-Za-z])([0-9])", "$1 $2"); TokenSTR = TokenSTR.replaceAll("([A-Z])([a-z])", "$1 $2"); TokenSTR = TokenSTR.replaceAll("([a-z])([A-Z])", "$1 $2"); TokenSTR = TokenSTR.replaceAll("([\\W])", " $1 "); TokenSTR = TokenSTR.replaceAll("[ ]+", " "); TokenSTR = TokenSTR.replaceAll("^[ ]+", ""); TokenSTR = TokenSTR.replaceAll("[ ]+$", ""); /* * Only for Gene */ if(ID.equals("ASJAS") && kInteger.parseInt(t2)) { tmp_ment=t1+" "+t2+" to "+t5; Annotation.add(MentionStart+"\t"+MentionLast+"\t"+tmp_ment+"\t"+Type+"\tASNS"); tmp_ment=t1+" "+t2+" to -"+t5; Annotation.add(MentionStart+"\t"+MentionLast+"\t"+tmp_ment+"\t"+Type+"\tASNOS"); tmp_ment=t1+" -"+t2+" to -"+t5; Annotation.add(MentionStart+"\t"+MentionLast+"\t"+tmp_ment+"\t"+Type+"\tAASNOS"); tmp_ment=t1+" "+t2+" to "+t1+" "+t5; Annotation.add(MentionStart+"\t"+MentionLast+"\t"+tmp_ment+"\t"+Type+"\tASNAS"); tmp_ment=t1+" "+t2+"-"+t5; Annotation.add(MentionStart+"\t"+MentionLast+"\t"+tmp_ment+"\t"+Type+"\tASNS"); tmp_ment=t1+" "+t2+", "+t5+", and "+(Integer.parseInt(t5)+2); Annotation.add(MentionStart+"\t"+MentionLast+"\t"+tmp_ment+"\t"+Type+"\tASCSCCS"); tmp_ment=t1+" -"+t2+", -"+t5+", and -"+(Integer.parseInt(t5)+2); Annotation.add(MentionStart+"\t"+MentionLast+"\t"+tmp_ment+"\t"+Type+"\tAASC0SCC0S"); } } } String Mention_tmp = Mention; String tokens[]=TokenSTR.split(" ",-1); //For Repeat HashMap Token2Num = new HashMap (); for(int p=0;p AbbLFStatus_hash = new HashMap (); for(String Pmid_LF : GNormPlus.PmidLF2Abb_hash.keySet()) { String pf[] = Pmid_LF.split("\\t",-1); if(pf[0].equals(Pmid)) { String Abb = GNormPlus.PmidLF2Abb_hash.get(Pmid_LF); String LF = pf[1]; Abb = Abb.replaceAll("([0-9])([A-Za-z])", "$1 $2"); Abb = Abb.replaceAll("([A-Za-z])([0-9])", "$1 $2"); Abb = Abb.replaceAll("([A-Z])([a-z])", "$1 $2"); Abb = Abb.replaceAll("([a-z])([A-Z])", "$1 $2"); Abb = Abb.replaceAll("([\\W])", " $1 "); Abb = Abb.replaceAll("[ ]+", " "); Abb = Abb.replaceAll("^[ ]+", ""); LF = LF.replaceAll("([0-9])([A-Za-z])", "$1 $2"); LF = LF.replaceAll("([A-Za-z])([0-9])", "$1 $2"); LF = LF.replaceAll("([A-Z])([a-z])", "$1 $2"); LF = LF.replaceAll("([a-z])([A-Z])", "$1 $2"); LF = LF.replaceAll("([\\W])", " $1 "); LF = LF.replaceAll("[ ]+", " "); LF = LF.replaceAll("^[ ]+", ""); LF = LF.replaceAll("[ ]+$", ""); Abb=Abb.replaceAll("([^A-Za-z0-9@ ])","\\\\$1"); LF=LF.replaceAll("([^A-Za-z0-9@ ])","\\\\$1"); Abb=Abb.toLowerCase(); LF=LF.toLowerCase(); Pattern ptmp1 = Pattern.compile("(.*)("+LF+")([ ]*\$[ ]*)("+Abb+")[ ]*\$.*"); Matcher mtmp1 = ptmp1.matcher(TokenSTR.toLowerCase()); Pattern ptmp2 = Pattern.compile("(.*)("+Abb+")([ ]*\$[ ]*)("+LF+")[ ]*\$.*"); Matcher mtmp2 = ptmp2.matcher(TokenSTR.toLowerCase()); int start_LF=0; int last_LF=0; int start_Abb=0; int last_Abb=0; if(mtmp1.find()) { start_LF = mtmp1.group(1).length(); last_LF = start_LF+mtmp1.group(2).length(); start_Abb = last_LF+mtmp1.group(3).length(); last_Abb = start_Abb+mtmp1.group(4).length(); } else if(mtmp2.find()) { start_Abb = mtmp2.group(1).length(); last_Abb = start_LF+mtmp2.group(2).length(); start_LF = last_LF+mtmp2.group(3).length(); last_LF = start_Abb+mtmp2.group(4).length(); } for(int l=start_LF;l0) { String B=tokens[p-1]; B=B.replaceAll("[A-Za-z]+", "A"); B=B.replaceAll("[0-9]+", "0"); WSB="WSB:"+B; } if(p3){Num_num="N:4+";}else{Num_num="N:"+ tmp.length();} //Number of Uppercase [A-Z] String Num_Uc=""; tmp=tokens[p]; tmp=tmp.replaceAll("[^A-Z]",""); if(tmp.length()>3){Num_Uc="U:4+";}else{Num_Uc="U:"+ tmp.length();} //Number of Lowercase [a-z] String Num_lc=""; tmp=tokens[p]; tmp=tmp.replaceAll("[^a-z]",""); if(tmp.length()>3){Num_lc="L:4+";}else{Num_lc="L:"+ tmp.length();} //Number of ALL char String Num_All=""; if(tokens[p].length()>3){Num_All="A:4+";}else{Num_All="A:"+ tokens[p].length();} //specific character (;:,.->+_) String SpecificC="__nil__"; if(tokens[p].equals(";") || tokens[p].equals(":") || tokens[p].equals(",") || tokens[p].equals(".") || tokens[p].equals("-") || tokens[p].equals(">") || tokens[p].equals("+") || tokens[p].equals("_")) { SpecificC="-SpecificC1-"; } else if(tokens[p].equals("(") || tokens[p].equals(")")) { SpecificC="-SpecificC2-"; } else if(tokens[p].equals("{") || tokens[p].equals("}")) { SpecificC="-SpecificC3-"; } else if(tokens[p].equals("[") || tokens[p].equals("]")) { SpecificC="-SpecificC4-"; } else if(tokens[p].equals("\\") || tokens[p].equals("/")) { SpecificC="-SpecificC5-"; } //Chemical Prefix/Suffix String ChemPreSuf="__nil__"; if(tokens[p].matches(".*(yl|ylidyne|oyl|sulfonyl)")){ChemPreSuf="-CHEMinlineSuffix-";} else if(tokens[p].matches("(meth|eth|prop|tetracos).*")){ChemPreSuf="-CHEMalkaneStem-";} else if(tokens[p].matches("(di|tri|tetra).*")){ChemPreSuf="-CHEMsimpleMultiplier-";} else if(tokens[p].matches("(benzen|pyridin|toluen).*")){ChemPreSuf="-CHEMtrivialRing-";} else if(tokens[p].matches(".*(one|ol|carboxylic|amide|ate|acid|ium|ylium|ide|uide|iran|olan|inan|pyrid|acrid|amid|keten|formazan|fydrazin)(s|)")){ChemPreSuf="-CHEMsuffix-";} //MentionType String MentionType="__nil__"; if(GNormPlus.SimConceptMention2Type_hash.containsKey(tokens[p])) { MentionType = "-"+GNormPlus.SimConceptMention2Type_hash.get(tokens[p])+"-"; } //Protein symbols String ProteinSym="__nil__"; if(tokens[p].matches(".*(glutamine|glutamic|leucine|valine|isoleucine|lysine|alanine|glycine|aspartate|methionine|threonine|histidine|aspartic|asparticacid|arginine|asparagine|tryptophan|proline|phenylalanine|cysteine|serine|glutamate|tyrosine|stop|frameshift).*")){ChemPreSuf="-ProteinSymFull-";} else if(tokens[p].matches("(cys|ile|ser|gln|met|asn|pro|lys|asp|thr|phe|ala|gly|his|leu|arg|trp|val|glu|tyr|fs|fsx)")){ChemPreSuf="-ProteinSymTri-";} else if(tokens[p].matches("[CISQMNPKDTFAGHLRWVEYX]")){ChemPreSuf="-ProteinSymChar-";} //Repeat String Repeat="__nil__"; if(Token2Num.get(tokens[p])>1 && tokens[p].length()>1 && (!tokens[p].matches("([\\W\\-\\_0-9]+|and|or|alpha|beta|gamma|theta|zeta|delta|kappa|II|VI|IV|III)"))) { Repeat="-Repeat-"; } //Patterns String Pattern1 = tokens[p]; if(Pattern1.matches(".*[\\W\\-\\_].*")) { Pattern1="__nil__"; } else { Pattern1=Pattern1.replaceAll("[A-Z]", "A"); Pattern1=Pattern1.replaceAll("[a-z]", "a"); Pattern1=Pattern1.replaceAll("[0-9]", "0"); Pattern1="P1:"+Pattern1; } String Pattern2 = tokens[p]; if(Pattern2.matches(".*[\\W\\-\\_].*")) { Pattern2="__nil__"; } else { Pattern2=Pattern2.replaceAll("[A-Za-z]", "a"); Pattern2=Pattern2.replaceAll("[0-9]", "0"); Pattern2="P2:"+Pattern2; } String Pattern3 = tokens[p]; if(Pattern3.matches(".*[\\W\\-\\_].*")) { Pattern3="__nil__"; } else { Pattern3=Pattern3.replaceAll("[A-Z]+", "A"); Pattern3=Pattern3.replaceAll("[a-z]+", "a"); Pattern3=Pattern3.replaceAll("[0-9]+", "0"); Pattern3="P3:"+Pattern3; } String Pattern4 = tokens[p]; if(Pattern4.matches(".*[\\W\\-\\_].*")) { Pattern4="__nil__"; } else { Pattern4=Pattern4.replaceAll("[A-Za-z]+", "a"); Pattern4=Pattern4.replaceAll("[0-9]+", "0"); Pattern4="P4:"+Pattern4; } //prefix String prefix=""; tmp=tokens[p]; if(tmp.length()>=1){ prefix=tmp.substring(0, 1);}else{prefix="__nil__";} if(tmp.length()>=2){ prefix=prefix+" "+tmp.substring(0, 2);}else{prefix=prefix+" __nil__";} if(tmp.length()>=3){ prefix=prefix+" "+tmp.substring(0, 3);}else{prefix=prefix+" __nil__";} if(tmp.length()>=4){ prefix=prefix+" "+tmp.substring(0, 4);}else{prefix=prefix+" __nil__";} if(tmp.length()>=5){ prefix=prefix+" "+tmp.substring(0, 5);}else{prefix=prefix+" __nil__";} //suffix String suffix=""; tmp=tokens[p]; if(tmp.length()>=1){ suffix=tmp.substring(tmp.length()-1, tmp.length());}else{suffix="__nil__";} if(tmp.length()>=2){ suffix=suffix+" "+tmp.substring(tmp.length()-2, tmp.length());}else{suffix=suffix+" __nil__";} if(tmp.length()>=3){ suffix=suffix+" "+tmp.substring(tmp.length()-3, tmp.length());}else{suffix=suffix+" __nil__";} if(tmp.length()>=4){ suffix=suffix+" "+tmp.substring(tmp.length()-4, tmp.length());}else{suffix=suffix+" __nil__";} if(tmp.length()>=5){ suffix=suffix+" "+tmp.substring(tmp.length()-5, tmp.length());}else{suffix=suffix+" __nil__";} //Abbreviation & Long Form String AbbLF="__nil__"; if(AbbLFStatus_hash.containsKey(Offset)) { AbbLF=AbbLFStatus_hash.get(Offset); } String Status = ID.substring(p, p+1); FileData.write(tokens[p]+" "+WSB+" "+WSF+" "+stem +" "+Num_num+" "+Num_num+" "+Num_Uc+" "+Num_lc+" "+Num_All+" "+SpecificC +" "+ChemPreSuf+" "+MentionType+" "+ProteinSym+" "+Repeat +" "+Pattern1+" "+Pattern2+" "+Pattern3+" "+Pattern4 +" "+prefix+" "+suffix+" "+AbbLF +" "+Status+"\n"); Offset=Offset+tokens[p].length()+1; if(ID.length()>tokens.length) { System.out.println(ID+"\t"+TokenSTR); } } FileData.write("\n"); } } } } FileData.close(); } catch(IOException e1){ System.out.println("[MR]: Input file is not exist.");} } public void FeatureExtraction_Test(String FilenameData) throws XMLStreamException { try { /** output files */ BufferedWriter FileData = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FilenameData), "UTF-8")); // .data //NLP modules SnowballStemmer stemmer = new englishStemmer(); /** PMIDs : i */ for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++) { String Pmid = GNormPlus.BioCDocobj.PMIDs.get(i); /** Paragraphs : j */ for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) { ArrayList Annotation = GNormPlus.BioCDocobj.Annotations.get(i).get(j); /** Annotations : k * 0 start * 1 last * 2 mention * 3 type * 4 id */ for (int k = 0; k < Annotation.size() ; k++) // k : Annotations { String anno[]=Annotation.get(k).split("\\t",-1); String Mentions = anno[2]; String Type = anno[3]; String MentionArr[]=Mentions.split("\\|",-1); if(Type.equals("Gene")) { for(int m=0;m Token2Num = new HashMap (); for(int p=0;p AbbLFStatus_hash = new HashMap (); for(String Pmid_LF : GNormPlus.PmidLF2Abb_hash.keySet()) { String pf[] = Pmid_LF.split("\\t",-1); if(pf[0].equals(Pmid)) { String Abb = GNormPlus.PmidLF2Abb_hash.get(Pmid_LF); String LF = pf[1]; Abb = Abb.replaceAll("([0-9])([A-Za-z])", "$1 $2"); Abb = Abb.replaceAll("([A-Za-z])([0-9])", "$1 $2"); Abb = Abb.replaceAll("([A-Z])([a-z])", "$1 $2"); Abb = Abb.replaceAll("([a-z])([A-Z])", "$1 $2"); Abb = Abb.replaceAll("([\\W])", " $1 "); Abb = Abb.replaceAll("[ ]+", " "); Abb = Abb.replaceAll("^[ ]+", ""); LF = LF.replaceAll("([0-9])([A-Za-z])", "$1 $2"); LF = LF.replaceAll("([A-Za-z])([0-9])", "$1 $2"); LF = LF.replaceAll("([A-Z])([a-z])", "$1 $2"); LF = LF.replaceAll("([a-z])([A-Z])", "$1 $2"); LF = LF.replaceAll("([\\W])", " $1 "); LF = LF.replaceAll("[ ]+", " "); LF = LF.replaceAll("^[ ]+", ""); Abb=Abb.replaceAll("([\\~\\!\\@\\#\\$\\%\\^\\&\\*\$\$\\_\\+\\-\\=\\[\\]\\;\\'\\,\\.\\/\\{\\}\\|\\:\\?])","\\\\$1"); LF=LF.replaceAll("([\\~\\!\\@\\#\\$\\%\\^\\&\\*\$\$\\_\\+\\-\\=\\[\\]\\;\\'\\,\\.\\/\\{\\}\\|\\:\\?])","\\\\$1"); Abb=Abb.toLowerCase(); LF=LF.toLowerCase(); Pattern ptmp1 = Pattern.compile("(.*)" + "("+LF+")" + "([ ]*\$[ ]*)" + "("+Abb+")" + "[ ]*\$.*"); Matcher mtmp1 = ptmp1.matcher(TokenSTR.toLowerCase()); Pattern ptmp2 = Pattern.compile("(.*)" + "("+Abb+")" + "([ ]*\$[ ]*)" + "("+LF+")" + "[ ]*\$.*"); Matcher mtmp2 = ptmp2.matcher(TokenSTR.toLowerCase()); int start_LF=0; int last_LF=0; int start_Abb=0; int last_Abb=0; if(mtmp1.find()) { start_LF = mtmp1.group(1).length(); last_LF = start_LF+mtmp1.group(2).length(); start_Abb = last_LF+mtmp1.group(3).length(); last_Abb = start_Abb+mtmp1.group(4).length(); } else if(mtmp2.find()) { start_Abb = mtmp2.group(1).length(); last_Abb = start_LF+mtmp2.group(2).length(); start_LF = last_LF+mtmp2.group(3).length(); last_LF = start_Abb+mtmp2.group(4).length(); } for(int l=start_LF;l0) { String B=tokens[p-1]; B=B.replaceAll("[A-Za-z]+", "A"); B=B.replaceAll("[0-9]+", "0"); WSB="WSB:"+B; } if(p3){Num_num="N:4+";}else{Num_num="N:"+ tmp.length();} //Number of Uppercase [A-Z] String Num_Uc=""; tmp=tokens[p]; tmp=tmp.replaceAll("[^A-Z]",""); if(tmp.length()>3){Num_Uc="U:4+";}else{Num_Uc="U:"+ tmp.length();} //Number of Lowercase [a-z] String Num_lc=""; tmp=tokens[p]; tmp=tmp.replaceAll("[^a-z]",""); if(tmp.length()>3){Num_lc="L:4+";}else{Num_lc="L:"+ tmp.length();} //Number of ALL char String Num_All=""; if(tokens[p].length()>3){Num_All="A:4+";}else{Num_All="A:"+ tokens[p].length();} //specific character (;:,.->+_) String SpecificC="__nil__"; if(tokens[p].equals(";") || tokens[p].equals(":") || tokens[p].equals(",") || tokens[p].equals(".") || tokens[p].equals("-") || tokens[p].equals(">") || tokens[p].equals("+") || tokens[p].equals("_")) { SpecificC="-SpecificC1-"; } else if(tokens[p].equals("(") || tokens[p].equals(")")) { SpecificC="-SpecificC2-"; } else if(tokens[p].equals("{") || tokens[p].equals("}")) { SpecificC="-SpecificC3-"; } else if(tokens[p].equals("[") || tokens[p].equals("]")) { SpecificC="-SpecificC4-"; } else if(tokens[p].equals("\\") || tokens[p].equals("/")) { SpecificC="-SpecificC5-"; } //Chemical Prefix/Suffix String ChemPreSuf="__nil__"; if(tokens[p].matches(".*(yl|ylidyne|oyl|sulfonyl)")){ChemPreSuf="-CHEMinlineSuffix-";} else if(tokens[p].matches("(meth|eth|prop|tetracos).*")){ChemPreSuf="-CHEMalkaneStem-";} else if(tokens[p].matches("(di|tri|tetra).*")){ChemPreSuf="-CHEMsimpleMultiplier-";} else if(tokens[p].matches("(benzen|pyridin|toluen).*")){ChemPreSuf="-CHEMtrivialRing-";} else if(tokens[p].matches(".*(one|ol|carboxylic|amide|ate|acid|ium|ylium|ide|uide|iran|olan|inan|pyrid|acrid|amid|keten|formazan|fydrazin)(s|)")){ChemPreSuf="-CHEMsuffix-";} //MentionType String MentionType="__nil__"; if(GNormPlus.SimConceptMention2Type_hash.containsKey(tokens[p])) { MentionType = "-"+GNormPlus.SimConceptMention2Type_hash.get(tokens[p])+"-"; } //Protein symbols String ProteinSym="__nil__"; if(tokens[p].matches(".*(glutamine|glutamic|leucine|valine|isoleucine|lysine|alanine|glycine|aspartate|methionine|threonine|histidine|aspartic|asparticacid|arginine|asparagine|tryptophan|proline|phenylalanine|cysteine|serine|glutamate|tyrosine|stop|frameshift).*")){ChemPreSuf="-ProteinSymFull-";} else if(tokens[p].matches("(cys|ile|ser|gln|met|asn|pro|lys|asp|thr|phe|ala|gly|his|leu|arg|trp|val|glu|tyr|fs|fsx)")){ChemPreSuf="-ProteinSymTri-";} else if(tokens[p].matches("[CISQMNPKDTFAGHLRWVEYX]")){ChemPreSuf="-ProteinSymChar-";} //Repeat String Repeat="__nil__"; if(Token2Num.get(tokens[p])>1 && tokens[p].length()>1 && (!tokens[p].matches("([\\W\\-\\_0-9]+|and|or|alpha|beta|gamma|theta|zeta|delta|kappa|II|VI|IV|III)"))) { Repeat="-Repeat-"; } //Patterns String Pattern1 = tokens[p]; if(Pattern1.matches(".*[\\W\\-\\_].*")) { Pattern1="__nil__"; } else { Pattern1=Pattern1.replaceAll("[A-Z]", "A"); Pattern1=Pattern1.replaceAll("[a-z]", "a"); Pattern1=Pattern1.replaceAll("[0-9]", "0"); Pattern1="P1:"+Pattern1; } String Pattern2 = tokens[p]; if(Pattern2.matches(".*[\\W\\-\\_].*")) { Pattern2="__nil__"; } else { Pattern2=Pattern2.replaceAll("[A-Za-z]", "a"); Pattern2=Pattern2.replaceAll("[0-9]", "0"); Pattern2="P2:"+Pattern2; } String Pattern3 = tokens[p]; if(Pattern3.matches(".*[\\W\\-\\_].*")) { Pattern3="__nil__"; } else { Pattern3=Pattern3.replaceAll("[A-Z]+", "A"); Pattern3=Pattern3.replaceAll("[a-z]+", "a"); Pattern3=Pattern3.replaceAll("[0-9]+", "0"); Pattern3="P3:"+Pattern3; } String Pattern4 = tokens[p]; if(Pattern4.matches(".*[\\W\\-\\_].*")) { Pattern4="__nil__"; } else { Pattern4=Pattern4.replaceAll("[A-Za-z]+", "a"); Pattern4=Pattern4.replaceAll("[0-9]+", "0"); Pattern4="P4:"+Pattern4; } //prefix String prefix=""; tmp=tokens[p]; if(tmp.length()>=1){ prefix=tmp.substring(0, 1);}else{prefix="__nil__";} if(tmp.length()>=2){ prefix=prefix+" "+tmp.substring(0, 2);}else{prefix=prefix+" __nil__";} if(tmp.length()>=3){ prefix=prefix+" "+tmp.substring(0, 3);}else{prefix=prefix+" __nil__";} if(tmp.length()>=4){ prefix=prefix+" "+tmp.substring(0, 4);}else{prefix=prefix+" __nil__";} if(tmp.length()>=5){ prefix=prefix+" "+tmp.substring(0, 5);}else{prefix=prefix+" __nil__";} //suffix String suffix=""; tmp=tokens[p]; if(tmp.length()>=1){ suffix=tmp.substring(tmp.length()-1, tmp.length());}else{suffix="__nil__";} if(tmp.length()>=2){ suffix=suffix+" "+tmp.substring(tmp.length()-2, tmp.length());}else{suffix=suffix+" __nil__";} if(tmp.length()>=3){ suffix=suffix+" "+tmp.substring(tmp.length()-3, tmp.length());}else{suffix=suffix+" __nil__";} if(tmp.length()>=4){ suffix=suffix+" "+tmp.substring(tmp.length()-4, tmp.length());}else{suffix=suffix+" __nil__";} if(tmp.length()>=5){ suffix=suffix+" "+tmp.substring(tmp.length()-5, tmp.length());}else{suffix=suffix+" __nil__";} //Abbreviation & Long Form String AbbLF="__nil__"; if(AbbLFStatus_hash.containsKey(Offset)) { AbbLF=AbbLFStatus_hash.get(Offset); } FileData.write(tokens[p]+" "+WSB+" "+WSF+" "+stem +" "+Num_num+" "+Num_num+" "+Num_Uc+" "+Num_lc+" "+Num_All+" "+SpecificC +" "+ChemPreSuf+" "+MentionType+" "+ProteinSym+" "+Repeat +" "+Pattern1+" "+Pattern2+" "+Pattern3+" "+Pattern4 +" "+prefix+" "+suffix+" "+AbbLF+"\n"); Offset=Offset+tokens[p].length()+1; } FileData.write("\n"); } } } } } FileData.close(); } catch(IOException e1){ System.out.println("[MR]: Input file is not exist.");} } public void CRF_test(String model, String FilenameData,String FilenameOutput) throws IOException { File f = new File(FilenameOutput); BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), "UTF-8")); Runtime runtime = Runtime.getRuntime(); String cmd ="CRF/crf_test -m "+model+" -o "+FilenameOutput+" "+FilenameData; try { Process process = runtime.exec(cmd); InputStream is = process.getInputStream(); InputStreamReader isr = new InputStreamReader(is, "UTF-8"); BufferedReader br = new BufferedReader(isr); String line=""; while ( (line = br.readLine()) != null) { fr.write(line); fr.newLine(); fr.flush(); } is.close(); isr.close(); br.close(); fr.close(); } catch (IOException e) { System.out.println(e); runtime.exit(0); } } public void CRF_learn(String model,String FilenameData) throws IOException { Runtime runtime = Runtime.getRuntime(); Process process = null; String line = null; InputStream is = null; InputStreamReader isr = null; BufferedReader br = null; String cmd = "CRF/crf_learn -f 3 -c 4.0 CRF/template_SimConcept "+FilenameData+" "+model; try { process = runtime.exec(cmd); is = process.getInputStream(); isr = new InputStreamReader(is, "UTF-8"); br = new BufferedReader(isr); while ( (line = br.readLine()) != null) { System.out.println(line); System.out.flush(); } is.close(); isr.close(); br.close(); } catch (IOException e) { System.out.println(e); runtime.exit(0); } } public void ReadCRFresult(String Filename,String FilenameOutput,String FilenameBioC) throws XMLStreamException, IOException { /** load CRF output */ ArrayList outputArr1 = new ArrayList(); BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameOutput), "UTF-8")); String line; while ((line = inputfile.readLine()) != null) { outputArr1.add(line); } inputfile.close(); /** * Recognize the mentions which can be simplified */ int Count_mention=0; boolean Simplified=false; String Mention=""; String Mention_NoSpace=""; String States=""; HashMap Mentions_hash = new HashMap(); HashMap States_hash = new HashMap(); HashMap Output_Split_mention_Ind = new HashMap(); HashMap Output_Split_mention = new HashMap(); for(int i=0;i Split_mention = new ArrayList(); ArrayList Split_state = new ArrayList(); String tmp_mention=""; String tmp_state=""; /** * count = Mentions_count.get(i) : # of the mention in the corpus (543) * Mentions_hash.get(count) : Original Mention (ORP - 1 to ORP - 6) * States_hash.get(count) : States (AASNOOS) */ String TokenArr[]=Mentions_hash.get(MNoSpace).split(" ",-1); String StateArr[]=States_hash.get(MNoSpace).split("",-1); //refinement : isn't used Pattern ptmp1 = Pattern.compile("^([S]+)([CN])([S]+)$"); Matcher mtmp1 = ptmp1.matcher(States_hash.get(MNoSpace)); if(mtmp1.find()) { States_hash.put(MNoSpace, mtmp1.group(1)+"J"+mtmp1.group(3)); } //Split BE int len=TokenArr.length; if(StateArr.length0) { Split_mention.add(tmp_mention); Split_state.add(tmp_state); } tmp_mention = ""; tmp_state = ""; } else //CNBF { tmp_mention = tmp_mention + TokenArr[s] + " "; tmp_state = tmp_state + StateArr[s]; } } if(!tmp_mention.equals("")) { Split_mention.add(tmp_mention); Split_state.add(tmp_state); } //Split B/F for(int m=0;m strainsX = new ArrayList(); ArrayList STAstrainsX = new ArrayList(); String each_token[] = Split_mention.get(m).split(" "); String each_state[] = Split_state.get(m).split(""); for(int s=0;s strainsCN = new ArrayList(); String CorN=""; String each_token[] = Split_mention.get(m).split(" ",-1); String each_state[] = Split_state.get(m).split("",-1); for(int k=0;k=4) { A=A.replace("s $", ""); } A=A+"STRAINXXX"; strainCN=strainCN+each_token[k]+" "; CNO_continous=0; } else if(each_state[k].matches("[CN]") && CNO_continous==0) { CorN=each_state[k]; strainsCN.add(strainCN); strainCN=""; CNO_continous++; } else if(each_state[k].equals("J")) { if(!strainCN.equals("")){strainsCN.add(strainCN);} A=A.replaceAll("STRAINXXXSTRAINXXX","STRAINXXX"); A=A.replaceAll("STRAINXXXSTRAINXXX","STRAINXXX"); ptmp1 = Pattern.compile("^(.+)s (.*)$"); mtmp1 = ptmp1.matcher(A); if(mtmp1.find() && mtmp1.group(1).length()>=3 ) { A = mtmp1.group(1)+ " "+mtmp1.group(2); } if(CorN.equals("C")) { for(int x=0;x2 && (tmp.substring(tmp.length()-2, tmp.length()-2).equals(" "))) { tmp = tmp.substring(0,tmp.length()-2); } if(Output_Split_mention_Ind.containsKey(MNoSpace)) { Output_Split_mention_Ind.put(MNoSpace, Output_Split_mention_Ind.get(MNoSpace)+"|"+tmp); } else { Output_Split_mention_Ind.put(MNoSpace, tmp); } } } else if(CorN.equals("N")) { if(strainsCN.contains(0) && strainsCN.contains(1)) { String strain1= strainsCN.get(0).replaceAll(" ", ""); String strain2= strainsCN.get(1).replaceAll(" ", ""); if(strain1.matches("[0-9]+") && strain2.matches("[0-9]+")) { if(Integer.parseInt(strain2)-Integer.parseInt(strain1)<=20) { for(int strCount=Integer.parseInt(strain1);strCount<=Integer.parseInt(strain2);strCount++) { String tmp=A; tmp = tmp.replace("STRAINXXX", Integer.toString(strCount)); tmp = tmp.replaceAll("[ ]+"," "); if(tmp.length()>2 && tmp.substring(tmp.length()-2, tmp.length()-2).equals(" ")) { tmp = tmp.substring(0,tmp.length()-2); } if(Output_Split_mention_Ind.containsKey(MNoSpace)) { Output_Split_mention_Ind.put(MNoSpace, Output_Split_mention_Ind.get(MNoSpace)+"|"+tmp); } else { Output_Split_mention_Ind.put(MNoSpace, tmp); } } } } else if(strain1.matches("[A-Z]+ ") && strain2.matches("[A-Z]+ ")) { int strInt1 = (int) strain1.replaceAll(" ", "").charAt(0); int strInt2 = (int) strain2.replaceAll(" ", "").charAt(0); if(strInt2-strInt1<=20) { for(int strCount=strInt1;strCount<=strInt2;strCount++) { String tmp=A; tmp = tmp.replace("STRAINXXX", Integer.toString(strCount)); tmp = tmp.replaceAll("[ ]+"," "); if(tmp.length()>2 && tmp.substring(tmp.length()-2, tmp.length()-2).equals(" ")) { tmp = tmp.substring(0,tmp.length()-2); } if(Output_Split_mention_Ind.containsKey(MNoSpace)) { Output_Split_mention_Ind.put(MNoSpace, Output_Split_mention_Ind.get(MNoSpace)+"|"+tmp); } else { Output_Split_mention_Ind.put(MNoSpace, tmp); } } } } else { if(Output_Split_mention.containsKey(MNoSpace)) { Output_Split_mention.put(MNoSpace, Output_Split_mention.get(MNoSpace)+"|"+Split_mention.get(m)); } else { Output_Split_mention.put(MNoSpace, Split_mention.get(m)); } } } } else { if(Output_Split_mention.containsKey(MNoSpace)) { Output_Split_mention.put(MNoSpace, Output_Split_mention.get(MNoSpace)+"|"+Split_mention.get(m)); } else { Output_Split_mention.put(MNoSpace, Split_mention.get(m)); } } A=""; strainCN=""; CNO_continous=0; strainsCN = new ArrayList(); CorN=""; } } if(!strainCN.equals("")){strainsCN.add(strainCN);} A=A.replaceAll("(STRAINXXX){2,}","STRAINXXX"); ptmp1 = Pattern.compile("^(.+)s (.*)$"); mtmp1 = ptmp1.matcher(A); if(mtmp1.find() && mtmp1.group(1).length()>=3 ) { A = mtmp1.group(1)+ " "+mtmp1.group(2); } if(CorN.equals("C")) { for(int x=0;x2 && (tmp.substring(tmp.length()-2, tmp.length()-2).equals(" "))) { tmp = tmp.substring(0,tmp.length()-2); } if(Output_Split_mention_Ind.containsKey(MNoSpace)) { Output_Split_mention_Ind.put(MNoSpace, Output_Split_mention_Ind.get(MNoSpace)+"|"+tmp); } else { Output_Split_mention_Ind.put(MNoSpace, tmp); } } } else if(CorN.equals("N")) { if(strainsCN.size()==2) { String strain1= strainsCN.get(0).replaceAll(" ", ""); String strain2= strainsCN.get(1).replaceAll(" ", ""); if(strain1.matches("[0-9]{1,7}") && strain2.matches("[0-9]{1,7}")) { if(Integer.parseInt(strain2)-Integer.parseInt(strain1)<=20) { for(int strCount=Integer.parseInt(strain1);strCount<=Integer.parseInt(strain2);strCount++) { String tmp=A; tmp = tmp.replace("STRAINXXX", Integer.toString(strCount)); tmp = tmp.replaceAll("[ ]+"," "); if(tmp.length()>2 && tmp.substring(tmp.length()-2, tmp.length()-2).equals(" ")) { tmp = tmp.substring(0,tmp.length()-2); } if(Output_Split_mention_Ind.containsKey(MNoSpace)) { Output_Split_mention_Ind.put(MNoSpace, Output_Split_mention_Ind.get(MNoSpace)+"|"+tmp); } else { Output_Split_mention_Ind.put(MNoSpace, tmp); } } } } else if(strain1.matches("[A-Z]+ ") && strain2.matches("[A-Z]+ ")) { int strInt1 = (int) strain1.replaceAll(" ", "").charAt(0); int strInt2 = (int) strain2.replaceAll(" ", "").charAt(0); if(strInt2-strInt1<=20) { for(int strCount=strInt1;strCount<=strInt2;strCount++) { String tmp=A; tmp = tmp.replace("STRAINXXX", Integer.toString(strCount)); tmp = tmp.replaceAll("[ ]+"," "); if(tmp.length()>2 && tmp.substring(tmp.length()-2, tmp.length()-2).equals(" ")) { tmp = tmp.substring(0,tmp.length()-2); } if(Output_Split_mention_Ind.containsKey(MNoSpace)) { Output_Split_mention_Ind.put(MNoSpace, Output_Split_mention_Ind.get(MNoSpace)+"|"+tmp); } else { Output_Split_mention_Ind.put(MNoSpace, tmp); } } } } else { if(Output_Split_mention.containsKey(MNoSpace)) { Output_Split_mention.put(MNoSpace, Output_Split_mention.get(MNoSpace)+"|"+Split_mention.get(m)); } else { Output_Split_mention.put(MNoSpace, Split_mention.get(m)); } } } } else { if(Output_Split_mention.containsKey(MNoSpace)) { Output_Split_mention.put(MNoSpace, Output_Split_mention.get(MNoSpace)+"|"+Split_mention.get(m)); } else { Output_Split_mention.put(MNoSpace, Split_mention.get(m)); } } } } for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++) { for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) { int Annotation_Num = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); for (int k = 0; k < Annotation_Num ; k++) // k : Annotations { String anno[]=GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\\t"); //Mention String MenArr[]=anno[2].split("\\|"); for(int m=0;m Mentions = new ArrayList(); for(int m=0;m ii // ii --> 2 for (int i = 0; i < GNormPlus.BioCDocobj.Annotations.size(); i++) { for (int j = 0; j < GNormPlus.BioCDocobj.Annotations.get(i).size(); j++) { int Annotation_Num = GNormPlus.BioCDocobj.Annotations.get(i).get(j).size(); for (int k = 0; k < Annotation_Num ; k++) // k : Annotations { String anno[]=GNormPlus.BioCDocobj.Annotations.get(i).get(j).get(k).split("\\t"); //Mention String MenArr[]=anno[2].split("\\|"); HashMap Mentions = new HashMap(); for(int m=0;m