Mark7549 committed on
Commit
4ea2d3a
·
1 Parent(s): dcc32dc

add file to extract info from lsj dictionary .xml files

Browse files
Files changed (1) hide show
  1. lsj_dict.py +74 -0
lsj_dict.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import xml.etree.ElementTree as ET
2
+ from collections import defaultdict
3
+ from autocomplete import load_compressed_word_list
4
+
5
+
6
def read_xml(file):
    """
    Parse one Greek LSJ dictionary XML file and index its entries by word.

    ``file`` is handed directly to ``ET.parse``. Every ``entryFree`` element
    found below the document root is converted with ``extract_entry_info``
    and stored under that record's ``'word'`` value; if two entries yield
    the same word, the later one replaces the earlier one.

    Returns a ``defaultdict(dict)`` mapping each word to its entry record,
    so looking up an unknown word yields (and inserts) an empty dict
    instead of raising ``KeyError``.
    """
    document_root = ET.parse(file).getroot()

    # Lazily convert each entry, then let dict.update consume (word, record)
    # pairs one at a time in document order.
    records = (
        extract_entry_info(entry_element)
        for entry_element in document_root.findall('.//entryFree')
    )
    entries_by_word = defaultdict(dict)
    entries_by_word.update((record['word'], record) for record in records)
    return entries_by_word
22
+
23
+
24
def extract_entry_info(entry):
    """
    Extract the word, lemma, spellings and text content of one LSJ entry.

    Args:
        entry: The XML element of a single dictionary entry (the caller
            passes ``entryFree`` elements).

    Returns:
        A dict with four keys:

        * ``'word'``: text of the first direct ``<orth>`` child. When the
          entry has no ``<orth>`` child at all, the lemma is used instead
          so that the entry does not raise ``AttributeError``.
        * ``'lemma'``: value of the entry's ``key`` attribute, or ``None``
          when the attribute is absent.
        * ``'orthographies'``: text of every direct ``<orth>`` child, in
          document order (empty list when there is none).
        * ``'definitions'``: dict whose ``'tr'`` key holds the text of the
          whole entry joined with single spaces and stripped, followed by
          one key per descendant tag (except ``tr``) holding that element's
          text plus tail. When a tag occurs several times, the last
          occurrence wins.
    """
    lemma = entry.get('key')

    # Collect the <orth> children once; the first one (if any) is the word.
    orth_elements = entry.findall('orth')
    orthographies = [orth.text for orth in orth_elements]
    word = orthographies[0] if orth_elements else lemma

    # NOTE: despite its name, the 'tr' slot stores the text of the entire
    # entry (all nested text and tails), not only the <tr> elements.
    definitions = {'tr': ' '.join(entry.itertext()).strip()}

    for child in entry:
        # child.iter() yields the child itself followed by its descendants.
        for element in child.iter():
            if element.tag == "tr":
                # Skip <tr> so it cannot overwrite the full-text 'tr' slot.
                continue
            definitions[element.tag] = (element.text or "") + (element.tail or "")

    return {
        'word': word,
        'lemma': lemma,
        'orthographies': orthographies,
        'definitions': definitions,
    }
52
+
53
+
54
+
55
+
56
def main():
    """
    Read the LSJ XML files eng1 through eng27 and print every lemma.

    Each file is loaded with ``read_xml`` and its entries are folded into a
    single dictionary keyed by word. If the same word appears in more than
    one file, the fields of the later entry overwrite those of the earlier
    one. Finally the ``'lemma'`` of each merged entry is printed, one per
    line, in insertion order.
    """
    merged_info = {}

    for i in range(1, 28):  # files are numbered 1..27 inclusive
        path = f"LSJ_GreekUnicode/grc.lsj.perseus-eng{i}.xml"
        for word, info in read_xml(path).items():
            # Words are expected to be unique across files; on a clash the
            # newer fields are layered on top of the existing record.
            if word not in merged_info:
                merged_info[word] = {}
            merged_info[word].update(info)

    # Only the records are needed here, not the words they are keyed by.
    for info in merged_info.values():
        print(info['lemma'])


if __name__ == "__main__":
    main()