|
|
|
|
|
|
|
|
|
|
|
package GNormPluslib; |
|
|
|
import java.io.*; |
|
import java.util.*; |
|
import java.util.regex.Matcher; |
|
import java.util.regex.Pattern; |
|
|
|
public class PrefixTree |
|
{ |
|
private Tree Tr=new Tree(); |
|
|
|
|
|
|
|
|
|
public static HashMap<String, String> StopWord_hash = new HashMap<String, String>(); |
|
|
|
public void Hash2Tree(HashMap<String, String> ID2Names) |
|
{ |
|
for(String ID : ID2Names.keySet()) |
|
{ |
|
String NameColumn[]=ID2Names.get(ID).split("\\|"); |
|
for(int i=0;i<NameColumn.length;i++) |
|
{ |
|
Tr.insertMention(NameColumn[i],ID); |
|
} |
|
} |
|
} |
|
public void Dictionary2Tree_Combine(String Filename,String StopWords,String MentionType) |
|
{ |
|
try |
|
{ |
|
|
|
|
|
|
|
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(StopWords), "UTF-8")); |
|
String line=""; |
|
while ((line = br.readLine()) != null) |
|
{ |
|
StopWord_hash.put(line, "StopWord"); |
|
} |
|
br.close(); |
|
|
|
BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(Filename), "UTF-8")); |
|
line=""; |
|
|
|
while ((line = inputfile.readLine()) != null) |
|
{ |
|
|
|
|
|
String Column[]=line.split("\t"); |
|
if(Column.length>1) |
|
{ |
|
Column[0]=Column[0].replace("species:ncbi:",""); |
|
Column[1]=Column[1].replaceAll(" strain=", " "); |
|
Column[1]=Column[1].replaceAll("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]", " "); |
|
Column[1]=Column[1].replaceAll("[\\(\\)]", " "); |
|
String SpNameColumn[]=Column[1].split("\\|"); |
|
for(int i=0;i<SpNameColumn.length;i++) |
|
{ |
|
String tmp = SpNameColumn[i]; |
|
tmp=tmp.replaceAll("[\\W\\-\\_]", ""); |
|
|
|
|
|
|
|
|
|
if( MentionType.equals("Species") && |
|
(!SpNameColumn[i].substring(0, 1).matches("[\\W\\-\\_]")) && |
|
(!SpNameColumn[i].matches("a[\\W\\-\\_].*")) && |
|
tmp.length()>=3 |
|
) |
|
{ |
|
boolean stopword_boolean=false; |
|
for(String stopword_RegEx : StopWord_hash.keySet()) |
|
{ |
|
Pattern ptmp = Pattern.compile("^"+stopword_RegEx+"$"); |
|
Matcher mtmp = ptmp.matcher(SpNameColumn[i].toLowerCase()); |
|
if(mtmp.find()) |
|
{ |
|
stopword_boolean=true; |
|
} |
|
} |
|
if(stopword_boolean == false) |
|
{ |
|
Tr.insertMention(SpNameColumn[i],Column[0]); |
|
} |
|
} |
|
|
|
|
|
|
|
else if (MentionType.equals("Gene") && |
|
(!SpNameColumn[i].substring(0, 1).matches("[\\W\\-\\_]")) && |
|
tmp.length()>=3 |
|
) |
|
{ |
|
if(!StopWord_hash.containsKey(SpNameColumn[i].toLowerCase())) |
|
{ |
|
Tr.insertMention(SpNameColumn[i],Column[0]); |
|
} |
|
} |
|
|
|
|
|
|
|
else if (MentionType.equals("Cell") && |
|
(!SpNameColumn[i].substring(0, 1).matches("[\\W\\-\\_]")) && |
|
tmp.length()>=3 |
|
) |
|
{ |
|
if(!StopWord_hash.containsKey(SpNameColumn[i].toLowerCase())) |
|
{ |
|
Tr.insertMention(SpNameColumn[i],Column[0]); |
|
} |
|
} |
|
|
|
|
|
|
|
else if ((!SpNameColumn[i].substring(0, 1).matches("[\\W\\-\\_]")) && |
|
tmp.length()>=3 |
|
) |
|
{ |
|
if(!StopWord_hash.containsKey(SpNameColumn[i].toLowerCase())) |
|
{ |
|
Tr.insertMention(SpNameColumn[i],Column[0]); |
|
} |
|
} |
|
} |
|
} |
|
} |
|
inputfile.close(); |
|
} |
|
catch(IOException e1){ System.out.println("[Dictionary2Tree_Combine]: Input file is not exist.");} |
|
} |
|
public void Dictionary2Tree_UniqueGene(String Filename,String StopWords,String Preifx) |
|
{ |
|
try |
|
{ |
|
|
|
|
|
|
|
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(StopWords), "UTF-8")); |
|
String line=""; |
|
while ((line = br.readLine()) != null) |
|
{ |
|
StopWord_hash.put(line, "StopWord"); |
|
} |
|
br.close(); |
|
|
|
BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(Filename), "UTF-8")); |
|
line=""; |
|
|
|
while ((line = inputfile.readLine()) != null) |
|
{ |
|
|
|
|
|
String Column[]=line.split("\t"); |
|
if(Column.length>1) |
|
{ |
|
if(!StopWord_hash.containsKey(Column[0].toLowerCase())) |
|
{ |
|
if(Preifx.equals("")) |
|
{ |
|
Tr.insertMention(Column[0],Column[1]); |
|
} |
|
else if(Preifx.equals("Num") && Column[0].matches("[0-9].*")) |
|
{ |
|
Tr.insertMention(Column[0],Column[1]); |
|
} |
|
else if(Preifx.equals("AZNum") && Column[0].matches("[a-z][0-9].*")) |
|
{ |
|
Tr.insertMention(Column[0],Column[1]); |
|
} |
|
else if(Preifx.equals("lo") && Column[0].length()>2 && Column[0].substring(0,2).equals(Preifx)) |
|
{ |
|
if( ! Column[0].matches("loc[0-9]+")) |
|
{ |
|
Tr.insertMention(Column[0],Column[1]); |
|
} |
|
} |
|
else if(Preifx.equals("un") && Column[0].length()>2 && Column[0].substring(0,2).equals(Preifx)) |
|
{ |
|
if(Column[0].length()>=6 && Column[0].substring(0,6).equals("unchar")) |
|
{ |
|
|
|
} |
|
else |
|
{ |
|
Tr.insertMention(Column[0],Column[1]); |
|
} |
|
} |
|
else if(Column[0].length()>2 && Column[0].substring(0,2).equals(Preifx)) |
|
{ |
|
Tr.insertMention(Column[0],Column[1]); |
|
} |
|
} |
|
} |
|
} |
|
inputfile.close(); |
|
} |
|
catch(IOException e1){ System.out.println("[Dictionary2Tree_UniqueGene]: Input file is not exist.");} |
|
} |
|
public void Dictionary2Tree_UniqueSpecies(String Filename,String StopWords,String Preifx) |
|
{ |
|
try |
|
{ |
|
|
|
|
|
|
|
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(StopWords), "UTF-8")); |
|
String line=""; |
|
while ((line = br.readLine()) != null) |
|
{ |
|
StopWord_hash.put(line, "StopWord"); |
|
} |
|
br.close(); |
|
|
|
BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(Filename), "UTF-8")); |
|
line=""; |
|
while ((line = inputfile.readLine()) != null) |
|
{ |
|
|
|
|
|
String Column[]=line.split("\t"); |
|
if(Column.length>1) |
|
{ |
|
if(!StopWord_hash.containsKey(Column[0].toLowerCase())) |
|
{ |
|
if(Preifx.equals("")) |
|
{ |
|
if(Column[0].matches(".*[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_].*")) |
|
{ |
|
String mention_rev=Column[0].replaceAll("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]", " "); |
|
String mention_tmp=mention_rev.replaceAll("[\\W\\-\\_]",""); |
|
if(mention_tmp.length()>=10) |
|
{ |
|
Tr.insertMention(mention_rev,Column[1]); |
|
} |
|
} |
|
else |
|
{ |
|
Tr.insertMention(Column[0],Column[1]); |
|
} |
|
|
|
} |
|
else if(Column[0].matches("[0-9][0-9].*")) |
|
{ |
|
if(Preifx.equals("Num")) |
|
{ |
|
if(Column[0].matches(".*[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_].*")) |
|
{ |
|
String mention_rev=Column[0].replaceAll("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]", " "); |
|
String mention_tmp=mention_rev.replaceAll("[\\W\\-\\_]",""); |
|
if(mention_tmp.length()>=10) |
|
{ |
|
Tr.insertMention(mention_rev,Column[1]); |
|
} |
|
} |
|
else |
|
{ |
|
Tr.insertMention(Column[0],Column[1]); |
|
} |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
else if(Column[0].matches("[a-z][a-z].*")) |
|
{ |
|
if(Column[0].length()>2 && Column[0].substring(0,2).equals(Preifx)) |
|
{ |
|
if(Column[0].matches(".*[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_].*")) |
|
{ |
|
String mention_rev=Column[0].replaceAll("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]", " "); |
|
String mention_tmp=mention_rev.replaceAll("[\\W\\-\\_]",""); |
|
if(mention_tmp.length()>=10) |
|
{ |
|
Tr.insertMention(mention_rev,Column[1]); |
|
} |
|
} |
|
else |
|
{ |
|
Tr.insertMention(Column[0],Column[1]); |
|
} |
|
} |
|
} |
|
else if(Preifx.equals("Others")) |
|
{ |
|
if(Column[0].matches(".*[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_].*")) |
|
{ |
|
String mention_rev=Column[0].replaceAll("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]", " "); |
|
String mention_tmp=mention_rev.replaceAll("[\\W\\-\\_]",""); |
|
if(mention_tmp.length()>=10) |
|
{ |
|
Tr.insertMention(mention_rev,Column[1]); |
|
} |
|
} |
|
else |
|
{ |
|
Tr.insertMention(Column[0],Column[1]); |
|
} |
|
} |
|
} |
|
} |
|
} |
|
inputfile.close(); |
|
} |
|
catch(IOException e1){ System.out.println("[Dictionary2Tree_UniqueGene]: Input file is not exist.");} |
|
} |
|
public void TreeFile2Tree(String Filename) |
|
{ |
|
try |
|
{ |
|
|
|
|
|
BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(Filename), "UTF-8")); |
|
String line=""; |
|
int count=0; |
|
while ((line = inputfile.readLine()) != null) |
|
{ |
|
String Anno[]=line.split("\t"); |
|
if(Anno.length<2){System.out.println(count+"\t"+line);} |
|
String LocationInTree = Anno[0]; |
|
String token = Anno[1]; |
|
String identifier=""; |
|
if(Anno.length==3) |
|
{ |
|
identifier = Anno[2]; |
|
} |
|
String LocationsInTree[]=LocationInTree.split("-"); |
|
TreeNode tmp = Tr.root; |
|
for(int i=0;i<LocationsInTree.length-1;i++) |
|
{ |
|
tmp=tmp.links.get(Integer.parseInt(LocationsInTree[i])-1); |
|
} |
|
tmp.InsertToken(token,identifier); |
|
|
|
count++; |
|
} |
|
inputfile.close(); |
|
} |
|
catch(IOException e1){ System.out.println("[TreeFile2Tee]: Input file: "+ Filename +" is not exist.");} |
|
} |
|
|
|
|
|
|
|
|
|
public String MentionMatch(String Mentions) |
|
{ |
|
ArrayList<String> location = new ArrayList<String>(); |
|
String Menlist[]=Mentions.split("\\|"); |
|
for(int m=0;m<Menlist.length;m++) |
|
{ |
|
String Mention=Menlist[m]; |
|
String Mention_lc=Mention.toLowerCase(); |
|
Mention_lc = Mention_lc.replaceAll("[\\W\\-\\_]+", ""); |
|
Mention_lc = Mention_lc.replaceAll("([0-9])([a-z])", "$1 $2"); |
|
Mention_lc = Mention_lc.replaceAll("([a-z])([0-9])", "$1 $2"); |
|
String Tkns[]=Mention_lc.split(" "); |
|
|
|
int PrefixTranslation=0; |
|
int i=0; |
|
boolean find=false; |
|
TreeNode tmp = Tr.root; |
|
|
|
while( i<Tkns.length && tmp.CheckChild(Tkns[i],PrefixTranslation)>=0) |
|
{ |
|
if(i == Tkns.length-1){PrefixTranslation = 1;} |
|
tmp=tmp.links.get(tmp.CheckChild(Tkns[i],PrefixTranslation)); |
|
find=true; |
|
i++; |
|
} |
|
if(find == true) |
|
{ |
|
if(i==Tkns.length) |
|
{ |
|
if(!tmp.Concept.equals("")) |
|
{ |
|
return tmp.Concept; |
|
} |
|
else |
|
{ |
|
return "-1"; |
|
|
|
} |
|
} |
|
else |
|
{ |
|
return "-2"; |
|
|
|
} |
|
} |
|
else |
|
{ |
|
return "-3"; |
|
|
|
} |
|
} |
|
return "-3"; |
|
} |
|
|
|
|
|
|
|
|
|
public String MentionMatch_species(String Mentions) |
|
{ |
|
ArrayList<String> location = new ArrayList<String>(); |
|
String Menlist[]=Mentions.split("\\|"); |
|
for(int m=0;m<Menlist.length;m++) |
|
{ |
|
String Mention=Menlist[m]; |
|
String Mention_lc=Mention.toLowerCase(); |
|
Mention_lc = Mention_lc.replaceAll("[\\W\\-\\_]+", " "); |
|
Mention_lc = Mention_lc.replaceAll("([0-9])([a-z])", "$1 $2"); |
|
Mention_lc = Mention_lc.replaceAll("([a-z])([0-9])", "$1 $2"); |
|
Mention_lc = Mention_lc.replaceAll("^[ ]+", ""); |
|
Mention_lc = Mention_lc.replaceAll("[ ]+$", ""); |
|
String Tkns[]=Mention_lc.split(" "); |
|
|
|
int PrefixTranslation=0; |
|
int i=0; |
|
boolean find=false; |
|
TreeNode tmp = Tr.root; |
|
|
|
while( i<Tkns.length && tmp.CheckChild(Tkns[i],PrefixTranslation)>=0) |
|
{ |
|
if(i == Tkns.length-1){PrefixTranslation = 1;} |
|
tmp=tmp.links.get(tmp.CheckChild(Tkns[i],PrefixTranslation)); |
|
find=true; |
|
i++; |
|
} |
|
if(find == true) |
|
{ |
|
if(i==Tkns.length) |
|
{ |
|
if(!tmp.Concept.equals("")) |
|
{ |
|
return tmp.Concept; |
|
} |
|
else |
|
{ |
|
return "-1"; |
|
|
|
} |
|
} |
|
else |
|
{ |
|
return "-2"; |
|
|
|
} |
|
} |
|
else |
|
{ |
|
return "-3"; |
|
|
|
} |
|
} |
|
return "-3"; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
public ArrayList<String> SearchMentionLocation(String Doc,String ConceptType) |
|
{ |
|
ArrayList<String> location = new ArrayList<String>(); |
|
Doc=Doc+" XXXX XXXX"; |
|
String Doc_org=Doc; |
|
Doc=Doc.toLowerCase(); |
|
String Doc_lc=Doc; |
|
Doc = Doc.replaceAll("([0-9])([A-Za-z])", "$1 $2"); |
|
Doc = Doc.replaceAll("([A-Za-z])([0-9])", "$1 $2"); |
|
Doc = Doc.replaceAll("[\\W^;:,]+", " "); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
String DocTkns[]=Doc.split(" "); |
|
int Offset=0; |
|
int Start=0; |
|
int Last=0; |
|
int FirstTime=0; |
|
|
|
while(Doc_lc.length()>0 && Doc_lc.substring(0,1).matches("[\\W]")) |
|
{ |
|
Doc_lc=Doc_lc.substring(1); |
|
Offset++; |
|
} |
|
|
|
for(int i=0;i<DocTkns.length;i++) |
|
{ |
|
|
|
|
|
int pre_i=i; |
|
int pre_Start=Start; |
|
int pre_Last=Last; |
|
String pre_Doc_lc=Doc_lc; |
|
int pre_Offset=Offset; |
|
|
|
TreeNode tmp = Tr.root; |
|
boolean find=false; |
|
int PrefixTranslation=2; |
|
if(ConceptType.equals("Species")) |
|
{ |
|
PrefixTranslation=3; |
|
} |
|
int ConceptFound=i; |
|
String ConceptFound_STR=""; |
|
int FirstTime_while = -1; |
|
|
|
while( tmp.CheckChild(DocTkns[i],PrefixTranslation)>=0 ) |
|
{ |
|
FirstTime_while++; |
|
tmp=tmp.links.get(tmp.CheckChild(DocTkns[i],PrefixTranslation)); |
|
if(Start==0 && FirstTime>0){Start = Offset;} |
|
if(Doc_lc.length()>=DocTkns[i].length() && Doc_lc.substring(0,DocTkns[i].length()).equals(DocTkns[i])) |
|
{ |
|
if(DocTkns[i].length()>0) |
|
{ |
|
Doc_lc=Doc_lc.substring(DocTkns[i].length()); |
|
Offset=Offset+DocTkns[i].length(); |
|
} |
|
} |
|
Last = Offset; |
|
while(Doc_lc.length()>0 && Doc_lc.substring(0,1).matches("[\\W]")) |
|
{ |
|
Doc_lc=Doc_lc.substring(1); |
|
Offset++; |
|
} |
|
i++; |
|
|
|
if(ConceptType.equals("Species")) |
|
{ |
|
if(i<DocTkns.length-3 && DocTkns[i].matches("(str|strain|substr|substrain|subspecies|subsp|var|variant|pathovars|pv|biovar|bv)")) |
|
{ |
|
Doc_lc=Doc_lc.substring(DocTkns[i].length()); |
|
Offset=Offset+DocTkns[i].length(); |
|
Last = Offset; |
|
while(Doc_lc.length()>0 && Doc_lc.substring(0,1).matches("[\\W]")) |
|
{ |
|
Doc_lc=Doc_lc.substring(1); |
|
Offset++; |
|
} |
|
i++; |
|
} |
|
} |
|
|
|
if(!tmp.Concept.equals("") && (Last-Start>0)) |
|
{ |
|
if(Last<Doc_org.length()) |
|
{ |
|
ConceptFound=i; |
|
ConceptFound_STR=Start+"\t"+Last+"\t"+Doc_org.substring(Start, Last)+"\t"+tmp.Concept; |
|
|
|
} |
|
} |
|
|
|
find=true; |
|
if(i>=DocTkns.length){break;} |
|
else if(i==DocTkns.length-1){PrefixTranslation=2;} |
|
|
|
|
|
|
|
if(FirstTime_while==0) |
|
{ |
|
pre_i=i; |
|
pre_Start=Start; |
|
pre_Last=Last; |
|
pre_Doc_lc=Doc_lc; |
|
pre_Offset=Offset; |
|
} |
|
} |
|
|
|
if(find == true) |
|
{ |
|
|
|
if(!tmp.Concept.equals("")) |
|
{ |
|
if(Last<Doc_org.length() && Last>Start) |
|
{ |
|
location.add(Start+"\t"+Last+"\t"+Doc_org.substring(Start, Last)+"\t"+tmp.Concept); |
|
} |
|
} |
|
else |
|
{ |
|
if(!ConceptFound_STR.equals("")) |
|
{ |
|
location.add(ConceptFound_STR); |
|
i = ConceptFound + 1; |
|
} |
|
|
|
if(FirstTime_while>=1) |
|
{ |
|
i=pre_i; |
|
Start=pre_Start; |
|
Last=pre_Last; |
|
Doc_lc=pre_Doc_lc; |
|
Offset=pre_Offset; |
|
} |
|
} |
|
Start=0; |
|
Last=0; |
|
if(i>0){i--;} |
|
ConceptFound=i; |
|
ConceptFound_STR=""; |
|
} |
|
else |
|
{ |
|
|
|
|
|
if(FirstTime_while>=1 && tmp.Concept.equals("")) |
|
{ |
|
i=pre_i; |
|
Start=pre_Start; |
|
Last=pre_Last; |
|
Doc_lc=pre_Doc_lc; |
|
Offset=pre_Offset; |
|
} |
|
|
|
if(Doc_lc.length()>=DocTkns[i].length() && Doc_lc.substring(0,DocTkns[i].length()).equals(DocTkns[i])) |
|
{ |
|
if(DocTkns[i].length()>0) |
|
{ |
|
Doc_lc=Doc_lc.substring(DocTkns[i].length()); |
|
Offset=Offset+DocTkns[i].length(); |
|
} |
|
} |
|
} |
|
|
|
while(Doc_lc.length()>0 && Doc_lc.substring(0,1).matches("[\\W]")) |
|
{ |
|
Doc_lc=Doc_lc.substring(1); |
|
Offset++; |
|
} |
|
FirstTime++; |
|
|
|
|
|
} |
|
return location; |
|
} |
|
|
|
|
|
|
|
|
|
public String PrintTree() |
|
{ |
|
return Tr.PrintTree_preorder(Tr.root,""); |
|
} |
|
|
|
public void SaveTree(String outputfile) throws IOException |
|
{ |
|
BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputfile), "UTF-8")); |
|
Tr.SaveTree_preorder(Tr.root,"",fr); |
|
fr.close(); |
|
} |
|
|
|
|
|
public void insertMention(String Mention, String Identifier) |
|
{ |
|
Tr.insertMention(Mention,Identifier); |
|
} |
|
} |
|
|
|
class Tree |
|
{ |
|
|
|
|
|
|
|
public TreeNode root; |
|
|
|
public Tree() |
|
{ |
|
root = new TreeNode("-ROOT-"); |
|
} |
|
|
|
|
|
|
|
|
|
public void insertMention(String Mention, String Identifier) |
|
{ |
|
Mention=Mention.toLowerCase(); |
|
|
|
Mention = Mention.replaceAll("([0-9])([A-Za-z])", "$1 $2"); |
|
Mention = Mention.replaceAll("([A-Za-z])([0-9])", "$1 $2"); |
|
Mention = Mention.replaceAll("[\\W\\-\\_]+", " "); |
|
|
|
|
|
|
|
|
|
|
|
String Tokens[]=Mention.split(" "); |
|
TreeNode tmp = root; |
|
for(int i=0;i<Tokens.length;i++) |
|
{ |
|
if(tmp.CheckChild(Tokens[i],0)>=0) |
|
{ |
|
tmp=tmp.links.get( tmp.CheckChild(Tokens[i],0) ); |
|
if(i == Tokens.length-1) |
|
{ |
|
tmp.Concept=Identifier; |
|
} |
|
} |
|
else |
|
{ |
|
if(i == Tokens.length-1) |
|
{ |
|
tmp.InsertToken(Tokens[i],Identifier); |
|
} |
|
else |
|
{ |
|
tmp.InsertToken(Tokens[i]); |
|
} |
|
tmp=tmp.links.get(tmp.NumOflinks-1); |
|
} |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
public String PrintTree_preorder(TreeNode node, String LocationInTree) |
|
{ |
|
String opt=""; |
|
if(!node.token.equals("-ROOT-")) |
|
{ |
|
if(node.Concept.equals("")) |
|
{ |
|
opt=opt+LocationInTree+"\t"+node.token+"\n"; |
|
} |
|
else |
|
{ |
|
opt=opt+LocationInTree+"\t"+node.token+"\t"+node.Concept+"\n"; |
|
} |
|
} |
|
if(!LocationInTree.equals("")){LocationInTree=LocationInTree+"-";} |
|
for(int i=0;i<node.NumOflinks;i++) |
|
{ |
|
opt=opt+PrintTree_preorder(node.links.get(i),LocationInTree+(i+1)); |
|
} |
|
return opt; |
|
} |
|
|
|
|
|
|
|
|
|
public void SaveTree_preorder(TreeNode node, String LocationInTree, BufferedWriter fr) throws IOException |
|
{ |
|
if(!node.token.equals("-ROOT-")) |
|
{ |
|
if(node.Concept.equals("")) |
|
{ |
|
fr.write(LocationInTree+"\t"+node.token+"\n"); |
|
} |
|
else |
|
{ |
|
fr.write(LocationInTree+"\t"+node.token+"\t"+node.Concept+"\n"); |
|
} |
|
} |
|
if(!LocationInTree.equals("")){LocationInTree=LocationInTree+"-";} |
|
for(int i=0;i<node.NumOflinks;i++) |
|
{ |
|
SaveTree_preorder(node.links.get(i),LocationInTree+(i+1),fr); |
|
} |
|
} |
|
} |
|
|
|
class TreeNode |
|
{ |
|
String token; |
|
int NumOflinks; |
|
public String Concept; |
|
HashMap<String,Integer> Hashs; |
|
ArrayList<TreeNode> links; |
|
|
|
public TreeNode(String Tok,String ID) |
|
{ |
|
token = Tok; |
|
NumOflinks = 0; |
|
Concept = ID; |
|
links = new ArrayList<TreeNode>(); |
|
Hashs = new HashMap<String,Integer>(); |
|
} |
|
public TreeNode(String Tok) |
|
{ |
|
token = Tok; |
|
NumOflinks = 0; |
|
Concept = ""; |
|
links = new ArrayList<TreeNode>(); |
|
Hashs = new HashMap<String,Integer>(); |
|
} |
|
public TreeNode() |
|
{ |
|
token = ""; |
|
NumOflinks = 0; |
|
Concept = ""; |
|
links = new ArrayList<TreeNode>(); |
|
Hashs = new HashMap<String,Integer>(); |
|
} |
|
|
|
public String toString() |
|
{ |
|
return (token+"\t"+Concept); |
|
} |
|
|
|
|
|
|
|
|
|
public void InsertToken(String Tok) |
|
{ |
|
TreeNode NewNode = new TreeNode(Tok); |
|
|
|
|
|
links.add(NewNode); |
|
|
|
|
|
Hashs.put(Tok, NumOflinks); |
|
|
|
NumOflinks++; |
|
} |
|
public void InsertToken(String Tok,String ID) |
|
{ |
|
TreeNode NewNode = new TreeNode(Tok,ID); |
|
|
|
links.add(NewNode); |
|
|
|
|
|
Hashs.put(Tok, NumOflinks); |
|
|
|
NumOflinks++; |
|
} |
|
|
|
|
|
|
|
|
|
public int CheckChild(String Tok, Integer PrefixTranslation) |
|
{ |
|
if(Hashs.containsKey(Tok)) |
|
{ |
|
return(Hashs.get(Tok)); |
|
} |
|
|
|
if(PrefixTranslation == 1 && Tok.matches("(alpha|beta|gamam|[abg]|[12])")) |
|
{ |
|
if(Hashs.containsKey(GNormPlus.SuffixTranslationMap_hash.get(Tok))) |
|
{ |
|
return(Hashs.get(GNormPlus.SuffixTranslationMap_hash.get(Tok))); |
|
} |
|
|
|
} |
|
else if(PrefixTranslation == 2 && Tok.matches("[1-5]")) |
|
{ |
|
for(int i=0;i<links.size();i++) |
|
{ |
|
if(links.get(i).token.matches("[1-5]")) |
|
{ |
|
return(i); |
|
} |
|
} |
|
|
|
for(int i=1;i<=5;i++) |
|
{ |
|
if(Hashs.containsKey(i)){return(Hashs.get(i));} |
|
} |
|
} |
|
|
|
return(-1); |
|
} |
|
} |
|
|