/** * Project: GNormPlus * Function: Data storage in BioC format */ package GNormPluslib; import bioc.BioCAnnotation; import bioc.BioCCollection; import bioc.BioCDocument; import bioc.BioCLocation; import bioc.BioCPassage; import bioc.io.BioCDocumentWriter; import bioc.io.BioCFactory; import bioc.io.woodstox.ConnectorWoodstox; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; import java.time.LocalDate; import java.time.ZoneId; import javax.xml.stream.XMLStreamException; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.ArrayList; import java.util.HashMap; import java.util.List; public class BioCDoc { /* * Contexts in BioC file */ public ArrayList PMIDs=new ArrayList(); // Type: PMIDs public ArrayList> PassageNames = new ArrayList(); // PassageName public ArrayList> PassageOffsets = new ArrayList(); // PassageOffset public ArrayList> PassageContexts = new ArrayList(); // PassageContext public ArrayList>> Annotations = new ArrayList(); // Annotation - GNormPlus public String BioCFormatCheck(String InputFile) throws IOException { ConnectorWoodstox connector = new ConnectorWoodstox(); BioCCollection collection = new BioCCollection(); try { collection = connector.startRead(new InputStreamReader(new FileInputStream(InputFile), "UTF-8")); } catch (UnsupportedEncodingException | FileNotFoundException | XMLStreamException e) { BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(InputFile), "UTF-8")); String line=""; String status=""; String Pmid = ""; boolean tiabs=false; Pattern patt = Pattern.compile("^([^\\|\\t]+)\\|([^\\|\\t]+)\\|(.*)$"); while ((line = br.readLine()) != null) { Matcher mat = patt.matcher(line); if(mat.find()) //Title|Abstract { if(Pmid.equals("")) { Pmid = mat.group(1); } else if(!Pmid.equals(mat.group(1))) { return "[Error]: "+InputFile+" - A blank is needed between "+Pmid+" and "+mat.group(1)+"."; } status = "tiabs"; tiabs = true; } else if (line.contains("\t")) //Annotation { } else if(line.length()==0) //Processing { if(status.equals("")) { if(Pmid.equals("")) { return "[Error]: "+InputFile+" - It's neither BioC nor PubTator format. PMID is empty."; } else { return "[Error]: "+InputFile+" - A redundant blank is after "+Pmid+"."; } } Pmid=""; status=""; } } br.close(); if(tiabs == false) { return "[Error]: "+InputFile+" - It's neither BioC nor PubTator format."; } if(status.equals("")) { return "PubTator"; } else { return "[Error]: "+InputFile+" - The last column missed a blank."; } } return "BioC"; } public void PubTator2BioC(String input,String output) throws IOException, XMLStreamException // Input { /* * PubTator2BioC */ String parser = BioCFactory.WOODSTOX; BioCFactory factory = BioCFactory.newFactory(parser); BioCDocumentWriter BioCOutputFormat = factory.createBioCDocumentWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8")); BioCCollection biocCollection = new BioCCollection(); //time ZoneId zonedId = ZoneId.of( "America/Montreal" ); LocalDate today = LocalDate.now( zonedId ); biocCollection.setDate(today.toString()); biocCollection.setKey("BioC.key");//key biocCollection.setSource("GNormPlus");//source BioCOutputFormat.writeCollectionInfo(biocCollection); BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(input), "UTF-8")); ArrayList ParagraphType=new ArrayList(); // Type: Title|Abstract ArrayList ParagraphContent = new ArrayList(); // Text ArrayList annotations = new ArrayList(); // Annotation String line; String Pmid=""; while ((line = inputfile.readLine()) != null) { if(line.contains("|") && !line.contains("\t")) //Title|Abstract { String str[]=line.split("\\|",-1); Pmid=str[0]; if(str[1].equals("t")) { str[1]="title"; } if(str[1].equals("a")) { str[1]="abstract"; } ParagraphType.add(str[1]); if(str.length==3) { String txt = str[2]; txt = txt.replaceAll("ω","w"); txt = txt.replaceAll("μ","u"); txt = txt.replaceAll("κ","k"); txt = txt.replaceAll("α","a"); txt = txt.replaceAll("γ","g"); txt = txt.replaceAll("ɣ","g"); txt = txt.replaceAll("β","b"); txt = txt.replaceAll("×","x"); txt = txt.replaceAll("‑","-"); txt = txt.replaceAll("¹","1"); txt = txt.replaceAll("²","2"); txt = txt.replaceAll("°","o"); txt = txt.replaceAll("ö","o"); txt = txt.replaceAll("é","e"); txt = txt.replaceAll("à","a"); txt = txt.replaceAll("Á","A"); txt = txt.replaceAll("ε","e"); txt = txt.replaceAll("θ","O"); txt = txt.replaceAll("•","."); txt = txt.replaceAll("µ","u"); txt = txt.replaceAll("λ","r"); txt = txt.replaceAll("⁺","+"); txt = txt.replaceAll("ν","v"); txt = txt.replaceAll("ï","i"); txt = txt.replaceAll("ã","a"); txt = txt.replaceAll("≡","="); txt = txt.replaceAll("ó","o"); txt = txt.replaceAll("³","3"); txt = txt.replaceAll("〖","["); txt = txt.replaceAll("〗","]"); txt = txt.replaceAll("Å","A"); txt = txt.replaceAll("ρ","p"); txt = txt.replaceAll("ü","u"); txt = txt.replaceAll("ɛ","e"); txt = txt.replaceAll("č","c"); txt = txt.replaceAll("š","s"); txt = txt.replaceAll("ß","b"); txt = txt.replaceAll("═","="); txt = txt.replaceAll("£","L"); txt = txt.replaceAll("Ł","L"); txt = txt.replaceAll("ƒ","f"); txt = txt.replaceAll("ä","a"); txt = txt.replaceAll("–","-"); txt = txt.replaceAll("⁻","-"); txt = txt.replaceAll("〈","<"); txt = txt.replaceAll("〉",">"); txt = txt.replaceAll("χ","X"); txt = txt.replaceAll("Đ","D"); txt = txt.replaceAll("‰","%"); txt = txt.replaceAll("·","."); txt = txt.replaceAll("→",">"); txt = txt.replaceAll("←","<"); txt = txt.replaceAll("ζ","z"); txt = txt.replaceAll("π","p"); txt = txt.replaceAll("τ","t"); txt = txt.replaceAll("ξ","X"); txt = txt.replaceAll("η","h"); txt = txt.replaceAll("ø","0"); txt = txt.replaceAll("Δ","D"); txt = txt.replaceAll("∆","D"); txt = txt.replaceAll("∑","S"); txt = txt.replaceAll("Ω","O"); txt = txt.replaceAll("δ","d"); txt = txt.replaceAll("σ","s"); txt = txt.replaceAll("Φ","F"); txt = txt.replaceAll("[^\\~\\!\\@\\#\\$\\%\\^\\&\\*\$\$\\_\\+\\{\\}\\|\\:\"\\<\\>\\?\\`\\-\\=\\[\\]\\;\\'\\,\\.\\/\\r\\n0-9a-zA-Z ]"," "); ParagraphContent.add(txt); } else { ParagraphContent.add("- No text -"); } } else if (line.contains("\t")) //Annotation { String anno[]=line.split("\t"); if(anno.length==6) { annotations.add(anno[1]+"\t"+anno[2]+"\t"+anno[3]+"\t"+anno[4]+"\t"+anno[5]); } else if(anno.length==5) { annotations.add(anno[1]+"\t"+anno[2]+"\t"+anno[3]+"\t"+anno[4]); } } else if(line.length()==0) //Processing { BioCDocument biocDocument = new BioCDocument(); biocDocument.setID(Pmid); int startoffset=0; for(int i=0;i Infons = new HashMap(); Infons.put("type", ParagraphType.get(i)); biocPassage.setInfons(Infons); biocPassage.setText(ParagraphContent.get(i)); biocPassage.setOffset(startoffset); startoffset=startoffset+ParagraphContent.get(i).length()+1; for(int j=0;j=startoffset-ParagraphContent.get(i).length()-1) { BioCAnnotation biocAnnotation = new BioCAnnotation(); Map AnnoInfons = new HashMap(); if(anno.length==5) { AnnoInfons.put("Identifier", anno[4]); } AnnoInfons.put("type", anno[3]); biocAnnotation.setInfons(AnnoInfons); BioCLocation location = new BioCLocation(); location.setOffset(Integer.parseInt(anno[0])); location.setLength(Integer.parseInt(anno[1])-Integer.parseInt(anno[0])); biocAnnotation.setLocation(location); biocAnnotation.setText(anno[2]); biocPassage.addAnnotation(biocAnnotation); } } biocDocument.addPassage(biocPassage); } biocCollection.addDocument(biocDocument); ParagraphType.clear(); ParagraphContent.clear(); annotations.clear(); BioCOutputFormat.writeDocument(biocDocument); } } BioCOutputFormat.close(); inputfile.close(); } public void BioC2PubTator(String input,String output) throws IOException, XMLStreamException //Output { /* * BioC2PubTator */ HashMap pmidlist = new HashMap(); // check if appear duplicate pmids boolean duplicate = false; BufferedWriter PubTatorOutputFormat = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8")); ConnectorWoodstox connector = new ConnectorWoodstox(); BioCCollection collection = new BioCCollection(); collection = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8")); while (connector.hasNext()) { BioCDocument document = connector.next(); String PMID = document.getID(); if(pmidlist.containsKey(PMID)){System.out.println("\nError: duplicate pmid-"+PMID);duplicate = true;} else{pmidlist.put(PMID,"");} String Anno=""; for (BioCPassage passage : document.getPassages()) { if(passage.getInfon("type").equals("title")) { PubTatorOutputFormat.write(PMID+"|t|"+passage.getText()+"\n"); } else if(passage.getInfon("type").equals("abstract")) { PubTatorOutputFormat.write(PMID+"|a|"+passage.getText()+"\n"); } else { PubTatorOutputFormat.write(PMID+"|"+passage.getInfon("type")+"|"+passage.getText()+"\n"); } for (BioCAnnotation annotation : passage.getAnnotations()) { String Annotype = annotation.getInfon("type"); String Annoid=""; String Proteinid=""; if(Annotype.matches("(Gene|FamilyName|DomainMotif)")) { if(annotation.getInfons().containsKey("NCBI Gene")) { Annoid = annotation.getInfon("NCBI Gene"); String Annoidlist[]=Annoid.split(";"); Annoid=""; for(int x=0;x ParagraphContent = new HashMap(); // [PMID,0] -> title HashMap annotations = new HashMap(); // PMID ->Annotation String line; String Pmid=""; int count_paragraph=0; while ((line = inputfile.readLine()) != null) { if(line.contains("|") && !line.contains("\t")) //Title|Abstract { String str[]=line.split("\\|",-1); Pmid=str[0]; ParagraphContent.put(Pmid+"\t"+str[1],str[2]); count_paragraph++; } else if (line.contains("\t")) //Annotation { annotations.put(Pmid, annotations.get(Pmid)+line); } else if(line.length()==0) //Processing { count_paragraph=0; } } inputfile.close(); /* * BioC2PubTator */ HashMap pmidlist = new HashMap(); // check if appear duplicate pmids boolean duplicate = false; BufferedWriter PubTatorOutputFormat = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8")); ConnectorWoodstox connector = new ConnectorWoodstox(); BioCCollection collection = new BioCCollection(); collection = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8")); while (connector.hasNext()) { BioCDocument document = connector.next(); String PMID = document.getID(); if(pmidlist.containsKey(PMID)){System.out.println("\nError: duplicate pmid-"+PMID);duplicate = true;} else{pmidlist.put(PMID,"");} String Anno=""; for (BioCPassage passage : document.getPassages()) { if(passage.getInfon("type").equals("title") || passage.getInfon("type").equals("t")) { PubTatorOutputFormat.write(PMID+"|t|"+ParagraphContent.get(PMID+"\tt")+"\n"); } else if(passage.getInfon("type").equals("abstract") || passage.getInfon("type").equals("a")) { PubTatorOutputFormat.write(PMID+"|a|"+ParagraphContent.get(PMID+"\ta")+"\n"); } else { PubTatorOutputFormat.write(PMID+"|"+passage.getInfon("type")+"|"+passage.getText()+"\n"); } for (BioCAnnotation annotation : passage.getAnnotations()) { String Annotype = annotation.getInfon("type"); String Annoid=""; String Proteinid=""; if(Annotype.matches("(Gene|FamilyName|DomainMotif)")) { if(annotation.getInfons().containsKey("NCBI Gene")) { Annoid = annotation.getInfon("NCBI Gene"); String Annoidlist[]=Annoid.split(";"); Annoid=""; for(int x=0;x PassageName= new ArrayList(); // array of Passage name ArrayList PassageOffset= new ArrayList(); // array of Passage offset ArrayList PassageContext= new ArrayList(); // array of Passage context ArrayList> AnnotationInPMID= new ArrayList(); // array of Annotations in the PassageName /* * Per Passage */ for (BioCPassage passage : document.getPassages()) { PassageName.add(passage.getInfon("type")); //Paragraph String txt = passage.getText(); if(txt.matches("[\t ]+")) { txt = txt.replaceAll(".","@"); } else { //if(passage.getInfon("type").toLowerCase().equals("table")) //{ // txt=txt.replaceAll(" ", "|"); //} txt = txt.replaceAll("ω","w"); txt = txt.replaceAll("μ","u"); txt = txt.replaceAll("κ","k"); txt = txt.replaceAll("α","a"); txt = txt.replaceAll("γ","g"); txt = txt.replaceAll("ɣ","g"); txt = txt.replaceAll("β","b"); txt = txt.replaceAll("×","x"); txt = txt.replaceAll("‑","-"); txt = txt.replaceAll("¹","1"); txt = txt.replaceAll("²","2"); txt = txt.replaceAll("°","o"); txt = txt.replaceAll("ö","o"); txt = txt.replaceAll("é","e"); txt = txt.replaceAll("à","a"); txt = txt.replaceAll("Á","A"); txt = txt.replaceAll("ε","e"); txt = txt.replaceAll("θ","O"); txt = txt.replaceAll("•","."); txt = txt.replaceAll("µ","u"); txt = txt.replaceAll("λ","r"); txt = txt.replaceAll("⁺","+"); txt = txt.replaceAll("ν","v"); txt = txt.replaceAll("ï","i"); txt = txt.replaceAll("ã","a"); txt = txt.replaceAll("≡","="); txt = txt.replaceAll("ó","o"); txt = txt.replaceAll("³","3"); txt = txt.replaceAll("〖","["); txt = txt.replaceAll("〗","]"); txt = txt.replaceAll("Å","A"); txt = txt.replaceAll("ρ","p"); txt = txt.replaceAll("ü","u"); txt = txt.replaceAll("ɛ","e"); txt = txt.replaceAll("č","c"); txt = txt.replaceAll("š","s"); txt = txt.replaceAll("ß","b"); txt = txt.replaceAll("═","="); txt = txt.replaceAll("£","L"); txt = txt.replaceAll("Ł","L"); txt = txt.replaceAll("ƒ","f"); txt = txt.replaceAll("ä","a"); txt = txt.replaceAll("–","-"); txt = txt.replaceAll("⁻","-"); txt = txt.replaceAll("〈","<"); txt = txt.replaceAll("〉",">"); txt = txt.replaceAll("χ","X"); txt = txt.replaceAll("Đ","D"); txt = txt.replaceAll("‰","%"); txt = txt.replaceAll("·","."); txt = txt.replaceAll("→",">"); txt = txt.replaceAll("←","<"); txt = txt.replaceAll("ζ","z"); txt = txt.replaceAll("π","p"); txt = txt.replaceAll("τ","t"); txt = txt.replaceAll("ξ","X"); txt = txt.replaceAll("η","h"); txt = txt.replaceAll("ø","0"); txt = txt.replaceAll("Δ","D"); txt = txt.replaceAll("∆","D"); txt = txt.replaceAll("∑","S"); txt = txt.replaceAll("Ω","O"); txt = txt.replaceAll("δ","d"); txt = txt.replaceAll("σ","s"); txt = txt.replaceAll("Φ","F"); //txt = txt.replaceAll("[^\\~\\!\\@\\#\\$\\%\\^\\&\\*\$\$\\_\\+\\{\\}\\|\\:\"\\<\\>\\?\\`\\-\\=\\[\\]\\;\\'\\,\\.\\/\\r\\n0-9a-zA-Z ]"," "); } if(passage.getText().equals("") || passage.getText().matches("[ ]+")) { PassageContext.add("-notext-"); //Context } else { PassageContext.add(txt); //Context } PassageOffset.add(passage.getOffset()); //Offset ArrayList AnnotationInPassage= new ArrayList(); // array of Annotations in the PassageName AnnotationInPMID.add(AnnotationInPassage); } PassageNames.add(PassageName); PassageContexts.add(PassageContext); PassageOffsets.add(PassageOffset); Annotations.add(AnnotationInPMID); } } public void BioCReaderWithAnnotation(String input) throws IOException, XMLStreamException { ConnectorWoodstox connector = new ConnectorWoodstox(); BioCCollection collection = new BioCCollection(); collection = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8")); /* * Per document */ while (connector.hasNext()) { BioCDocument document = connector.next(); PMIDs.add(document.getID()); ArrayList PassageName= new ArrayList(); // array of Passage name ArrayList PassageOffset= new ArrayList(); // array of Passage offset ArrayList PassageContext= new ArrayList(); // array of Passage context ArrayList> AnnotationInPMID= new ArrayList(); // array of Annotations in the PassageName /* * Per Passage */ for (BioCPassage passage : document.getPassages()) { PassageName.add(passage.getInfon("type")); //Paragraph String txt = passage.getText(); if(txt.matches("[\t ]+")) { txt = txt.replaceAll(".","@"); } else { //if(passage.getInfon("type").toLowerCase().equals("table")) //{ // txt=txt.replaceAll(" ", "|"); //} txt = txt.replaceAll("ω","w"); txt = txt.replaceAll("μ","u"); txt = txt.replaceAll("κ","k"); txt = txt.replaceAll("α","a"); txt = txt.replaceAll("γ","g"); txt = txt.replaceAll("ɣ","g"); txt = txt.replaceAll("β","b"); txt = txt.replaceAll("×","x"); txt = txt.replaceAll("‑","-"); txt = txt.replaceAll("¹","1"); txt = txt.replaceAll("²","2"); txt = txt.replaceAll("°","o"); txt = txt.replaceAll("ö","o"); txt = txt.replaceAll("é","e"); txt = txt.replaceAll("à","a"); txt = txt.replaceAll("Á","A"); txt = txt.replaceAll("ε","e"); txt = txt.replaceAll("θ","O"); txt = txt.replaceAll("•","."); txt = txt.replaceAll("µ","u"); txt = txt.replaceAll("λ","r"); txt = txt.replaceAll("⁺","+"); txt = txt.replaceAll("ν","v"); txt = txt.replaceAll("ï","i"); txt = txt.replaceAll("ã","a"); txt = txt.replaceAll("≡","="); txt = txt.replaceAll("ó","o"); txt = txt.replaceAll("³","3"); txt = txt.replaceAll("〖","["); txt = txt.replaceAll("〗","]"); txt = txt.replaceAll("Å","A"); txt = txt.replaceAll("ρ","p"); txt = txt.replaceAll("ü","u"); txt = txt.replaceAll("ɛ","e"); txt = txt.replaceAll("č","c"); txt = txt.replaceAll("š","s"); txt = txt.replaceAll("ß","b"); txt = txt.replaceAll("═","="); txt = txt.replaceAll("£","L"); txt = txt.replaceAll("Ł","L"); txt = txt.replaceAll("ƒ","f"); txt = txt.replaceAll("ä","a"); txt = txt.replaceAll("–","-"); txt = txt.replaceAll("⁻","-"); txt = txt.replaceAll("〈","<"); txt = txt.replaceAll("〉",">"); txt = txt.replaceAll("χ","X"); txt = txt.replaceAll("Đ","D"); txt = txt.replaceAll("‰","%"); txt = txt.replaceAll("·","."); txt = txt.replaceAll("→",">"); txt = txt.replaceAll("←","<"); txt = txt.replaceAll("ζ","z"); txt = txt.replaceAll("π","p"); txt = txt.replaceAll("τ","t"); txt = txt.replaceAll("ξ","X"); txt = txt.replaceAll("η","h"); txt = txt.replaceAll("ø","0"); txt = txt.replaceAll("Δ","D"); txt = txt.replaceAll("∆","D"); txt = txt.replaceAll("∑","S"); txt = txt.replaceAll("Ω","O"); txt = txt.replaceAll("δ","d"); txt = txt.replaceAll("σ","s"); txt = txt.replaceAll("Φ","F"); //txt = txt.replaceAll("[^\\~\\!\\@\\#\\$\\%\\^\\&\\*\$\$\\_\\+\\{\\}\\|\\:\"\\<\\>\\?\\`\\-\\=\\[\\]\\;\\'\\,\\.\\/\\r\\n0-9a-zA-Z ]"," "); } if(passage.getText().equals("") || passage.getText().matches("[ ]+")) { PassageContext.add("-notext-"); //Context } else { PassageContext.add(txt); //Context } PassageOffset.add(passage.getOffset()); //Offset ArrayList AnnotationInPassage= new ArrayList(); // array of Annotations in the PassageName /* * Per Annotation : * start * last * mention * type * id */ for (BioCAnnotation Anno : passage.getAnnotations()) { int start = Anno.getLocations().get(0).getOffset()-passage.getOffset(); // start int last = start + Anno.getLocations().get(0).getLength(); // last String AnnoMention=Anno.getText(); // mention String Annotype = Anno.getInfon("type"); // type String Annoid = Anno.getInfon("Identifier"); // identifier | MESH if(Annoid == null) { Annoid = Anno.getInfon("Identifier"); // identifier | MESH } if(Annoid == null || Annoid.equals("null")) { AnnotationInPassage.add(start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype); //paragraph } else { AnnotationInPassage.add(start+"\t"+last+"\t"+AnnoMention+"\t"+Annotype+"\t"+Annoid); //paragraph } } AnnotationInPMID.add(AnnotationInPassage); } PassageNames.add(PassageName); PassageContexts.add(PassageContext); PassageOffsets.add(PassageOffset); Annotations.add(AnnotationInPMID); } } public void BioCOutput(String input,String output, ArrayList>> Annotations,boolean Final,boolean RemovePreviousAnno) throws IOException, XMLStreamException { boolean ShowUnNormalizedMention = false; if(GNormPlus.setup_hash.containsKey("ShowUnNormalizedMention") && GNormPlus.setup_hash.get("ShowUnNormalizedMention").equals("True")) { ShowUnNormalizedMention = true; } BioCDocumentWriter BioCOutputFormat = BioCFactory.newFactory(BioCFactory.WOODSTOX).createBioCDocumentWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8")); BioCCollection biocCollection_input = new BioCCollection(); BioCCollection biocCollection_output = new BioCCollection(); //input: BioC ConnectorWoodstox connector = new ConnectorWoodstox(); biocCollection_input = connector.startRead(new InputStreamReader(new FileInputStream(input), "UTF-8")); BioCOutputFormat.writeCollectionInfo(biocCollection_input); int i=0; //count for pmid while (connector.hasNext()) { BioCDocument document_output = new BioCDocument(); BioCDocument document_input = connector.next(); String PMID=document_input.getID(); document_output.setID(PMID); int annotation_count=0; int j=0; //count for paragraph for (BioCPassage passage_input : document_input.getPassages()) { BioCPassage passage_output = passage_input; if(RemovePreviousAnno == true) //clean the previous annotation, if the NER result is provided { passage_output.clearAnnotations(); } else { for (BioCAnnotation annotation : passage_output.getAnnotations()) { annotation.setID(""+annotation_count); annotation_count++; } } int passage_Offset = passage_input.getOffset(); String passage_Text = passage_input.getText(); ArrayList AnnotationInPassage = new ArrayList(); //ArrayList AnnotationInPassage = Annotations.get(i).get(j); if(Annotations.size()>i && Annotations.get(i).size()>j) { for(int a=0;alast) { String mention = Anno[2]; if(Final == true && passage_Text.length()>=last) { mention = passage_Text.substring(start, last); } if(mention.matches(".*\t.*")) { Anno[3]=Anno[4]; if(Anno.length>=6) { Anno[4]=Anno[5]; } } String type = Anno[3]; String id = ""; // optional if(Anno.length>=5){id = Anno[4];} if(Final == true) { for(int b=0;b=lastb) { mentionb = passage_Text.substring(startb, lastb); } if(mentionb.matches(".*\t.*")) { Annob[3]=Annob[4]; if(Annob.length>=6) { Annob[4]=Annob[5]; } } String typeb = Annob[3]; String idb = ""; // optional if(Annob.length>=5){idb = Annob[4];} if(start == startb && last == lastb && type.equals(typeb)) { found = true; if(id.matches("(Focus|Right|Left|Prefix|GeneID|Tax):[0-9]+") && (!idb.equals(""))) { } else if(idb.matches("(Focus|Right|Left|Prefix|GeneID|Tax):[0-9]+") && (!id.matches("(Focus|Right|Left|Prefix|GeneID|Tax):[0-9]+")) && (!id.equals(""))) { AnnotationInPassage.set(b, start+"\t"+last+"\t"+mention+"\t"+type+"\t"+id); } else { if(id.equals("")) { } else { AnnotationInPassage.set(b, start+"\t"+last+"\t"+mention+"\t"+type+"\t"+idb+";"+id); } } break; } } } } if(found == false) { AnnotationInPassage.add(Annotations.get(i).get(j).get(a)); } } } for(int a=0;a id_hash = new HashMap (); if(Anno.length>=5) { int start = Integer.parseInt(Anno[0]); int last = Integer.parseInt(Anno[1]); String mention = Anno[2]; if(Final == true && passage_Text.length()>=last) { mention = passage_Text.substring(start, last); } if(mention.matches(".*\t.*")) { Anno[3]=Anno[4]; if(Anno.length>=6) { Anno[4]=Anno[5]; } } String ids = Anno[4]; String idlist[]=ids.split(","); for(int b=0;blast) { String mention = Anno[2]; if(Final == true && passage_Text.length()>=last) { mention = passage_Text.substring(start, last); } if(mention.matches(".*\t.*")) { Anno[3]=Anno[4]; if(Anno.length>=6) { Anno[4]=Anno[5]; } } String type = Anno[3]; if(type.equals("GeneID")){type="Gene";} BioCAnnotation biocAnnotation = new BioCAnnotation(); Map AnnoInfons = new HashMap(); AnnoInfons.put("type", type); if(Anno.length>=5) { String identifier = Anno[4]; if(Final == true && ShowUnNormalizedMention==false) { if(type.matches("(FamilyName|Domain|Gene)")) { Pattern ptmp0 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9\\;]+)$"); Matcher mtmp0 = ptmp0.matcher(identifier); Pattern ptmp1 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)$"); Matcher mtmp1 = ptmp1.matcher(identifier); Pattern ptmp2 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)$"); Matcher mtmp2 = ptmp2.matcher(identifier); Pattern ptmp3 = Pattern.compile("^Homo\\:([0-9]+)$"); Matcher mtmp3 = ptmp3.matcher(identifier); if(mtmp0.find()) { String Method_SA = mtmp0.group(1); String TaxonomyID = mtmp0.group(2); String NCBIGeneID = mtmp0.group(3); if(GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID)) { AnnoInfons.put("UniProt", GNormPlus.Normalization2Protein_hash.get(NCBIGeneID)); } if(GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID)) { AnnoInfons.put("NCBI Homologene", GNormPlus.HomologeneID_hash.get(NCBIGeneID)); } AnnoInfons.put("NCBI Gene", NCBIGeneID); } else if(mtmp1.find()) { String Method_SA = mtmp1.group(1); String TaxonomyID = mtmp1.group(2); String NCBIGeneID = mtmp1.group(3); String HomoID = mtmp1.group(4); if(GNormPlus.Normalization2Protein_hash.containsKey(NCBIGeneID)) { AnnoInfons.put("UniProt", GNormPlus.Normalization2Protein_hash.get(NCBIGeneID)); } if(GNormPlus.HomologeneID_hash.containsKey(NCBIGeneID)) { AnnoInfons.put("NCBI Homologene", GNormPlus.HomologeneID_hash.get(NCBIGeneID)); } AnnoInfons.put("NCBI Gene", NCBIGeneID); } else if(mtmp2.find()) { String Method_SA = mtmp2.group(1); String TaxonomyID = mtmp2.group(2); AnnoInfons.put("FocusSpecies", "NCBITaxonomyID:"+TaxonomyID); } else if(mtmp3.find()) { String Method_SA = mtmp3.group(1); String HomoID = mtmp3.group(2); AnnoInfons.put("NCBI Homologene", HomoID); } else { String identifiers[] = identifier.split(";"); if(identifiers.length>1) { ArrayList identifierSTR = new ArrayList(); ArrayList ProteinidSTR = new ArrayList(); ArrayList HomoidSTR = new ArrayList(); for(int idi=0;idi