|
import xml.etree.ElementTree as ET |
|
from collections import defaultdict |
|
from autocomplete import load_compressed_word_list |
|
|
|
|
|
def read_xml(file):
    """
    Parse one XML file of the Greek LSJ dictionary and index its entries.

    Every ``entryFree`` element found below the document root is passed to
    ``extract_entry_info``; the resulting info dict is stored under its
    ``'word'`` value.

    Args:
        file: Path or file object accepted by ``ET.parse``.

    Returns:
        A ``defaultdict(dict)`` mapping each word to its info dict. When
        several entries share the same word, the last one in document
        order replaces the earlier ones.
    """
    document_root = ET.parse(file).getroot()

    entries_by_word = defaultdict(dict)
    # Feeding (key, value) pairs to update() keeps document order, so a
    # later duplicate word overrides an earlier one.
    entries_by_word.update(
        (parsed['word'], parsed)
        for parsed in map(extract_entry_info,
                          document_root.findall('.//entryFree'))
    )
    return entries_by_word
|
|
|
|
|
def extract_entry_info(entry):
    """
    Extract information from a single entry element of the LSJ dictionary.

    Args:
        entry: An ElementTree element representing one dictionary entry.

    Returns:
        A dict with the keys:

        * ``'word'``: text of the first direct ``orth`` child, or the
          value of the ``key`` attribute when the entry has no ``orth``
          child.
        * ``'lemma'``: value of the entry's ``key`` attribute (``None``
          if the attribute is absent).
        * ``'orthographies'``: list with the text of every direct ``orth``
          child, in document order.
        * ``'definitions'``: dict whose ``'tr'`` item is all text inside
          the entry joined with single spaces and stripped; every other
          tag found below the entry maps to the text plus tail of the
          last element carrying that tag.
    """
    lemma = entry.get('key')

    orth_elements = entry.findall('orth')
    orthographies = [orth.text for orth in orth_elements]

    # An entry without any <orth> child used to crash with AttributeError
    # on ``entry.find('orth').text``; fall back to the key attribute so the
    # entry is still usable.
    word = orthographies[0] if orth_elements else lemma

    definitions = {'tr': ' '.join(entry.itertext()).strip()}

    for child in entry:
        # child.iter() yields the child itself followed by its descendants.
        for node in child.iter():
            # <tr> elements are skipped so they cannot overwrite the
            # joined full text stored under 'tr' above.
            if node.tag != 'tr':
                definitions[node.tag] = (node.text or "") + (node.tail or "")

    return {
        'word': word,
        'lemma': lemma,
        'orthographies': orthographies,
        'definitions': definitions,
    }
|
|
|
|
|
|
|
|
|
def main():
    """
    Read the LSJ XML files numbered 1 through 27, merge their entries by
    word, and print the lemma of every merged entry.
    """
    combined = {}
    for volume in range(1, 28):
        path = f"LSJ_GreekUnicode/grc.lsj.perseus-eng{volume}.xml"
        for headword, entry in read_xml(path).items():
            # A word seen in an earlier file has its fields overwritten
            # by the fields of the later occurrence.
            combined.setdefault(headword, {}).update(entry)

    for entry in combined.values():
        print(entry['lemma'])


# Run the merge only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
|