File size: 2,164 Bytes
4ea2d3a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
import xml.etree.ElementTree as ET
from collections import defaultdict
from autocomplete import load_compressed_word_list
def read_xml(file):
    """
    Parse one LSJ XML file and index its entries by headword.

    Every ``entryFree`` element found anywhere in the document is passed to
    ``extract_entry_info``; the resulting record is stored under its
    ``'word'`` value. If two entries share the same word, the later one
    replaces the earlier one.

    Args:
        file: A filename or file object accepted by ``ET.parse``.

    Returns:
        A ``defaultdict(dict)`` mapping each word to its entry record.
    """
    entries_by_word = defaultdict(dict)
    document_root = ET.parse(file).getroot()
    for entry_node in document_root.findall('.//entryFree'):
        record = extract_entry_info(entry_node)
        entries_by_word[record['word']] = record
    return entries_by_word
def extract_entry_info(entry):
    """
    Extract information from an entry in the LSJ dictionary.

    Args:
        entry: An ``xml.etree.ElementTree.Element`` for one dictionary entry.

    Returns:
        A dict with four keys:

        * ``'word'`` - text of the first direct ``<orth>`` child, or ``None``
          when the entry has no ``<orth>`` child (or that element has no
          leading text).
        * ``'lemma'`` - value of the entry's ``key`` attribute (``None`` if
          the attribute is absent).
        * ``'orthographies'`` - text of every direct ``<orth>`` child, in
          document order.
        * ``'definitions'`` - a dict whose ``'tr'`` item is the full text of
          the entry (all text fragments joined with single spaces, then
          stripped), plus one item per other tag found below the entry,
          holding that element's ``text + tail``. When a tag occurs more
          than once, the last occurrence wins.
    """
    lemma = entry.get('key')

    # Only direct children are considered, matching entry.find('orth').
    orthographies = [orth.text for orth in entry.findall('orth')]
    # The headword is the first orthography; an entry without any <orth>
    # yields None instead of raising AttributeError on a missing element.
    word = orthographies[0] if orthographies else None

    # NOTE: 'tr' stores the text of the *whole* entry, not just the content
    # of <tr> elements; individual <tr> elements are skipped below so they
    # do not overwrite it.
    tag_texts = {'tr': ' '.join(entry.itertext()).strip()}
    for child in entry:
        # child.iter() yields the child itself followed by its descendants.
        for node in child.iter():
            if node.tag == 'tr':
                continue
            tag_texts[node.tag] = (node.text or "") + (node.tail or "")

    return {
        'word': word,
        'lemma': lemma,
        'orthographies': orthographies,
        'definitions': tag_texts,
    }
def main():
    """
    Load the 27 LSJ XML parts, merge them by word, and print every lemma.

    Each file ``grc.lsj.perseus-eng1.xml`` .. ``grc.lsj.perseus-eng27.xml``
    under ``LSJ_GreekUnicode/`` is read with ``read_xml``. Records for a word
    that appears in more than one file are merged with ``dict.update``, so
    values from later files overwrite earlier ones key by key.
    """
    combined = {}
    for i in range(1, 28):  # parts are numbered eng1 through eng27
        part_entries = read_xml(f"LSJ_GreekUnicode/grc.lsj.perseus-eng{i}.xml")
        for headword, record in part_entries.items():
            combined.setdefault(headword, {}).update(record)

    # Emit one lemma per merged entry, in insertion order.
    for record in combined.values():
        print(record['lemma'])
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|