# agalma / lsj_dict.py
# Commit 4ea2d3a (Mark7549): add file to extract info from lsj dictionary .xml files
import xml.etree.ElementTree as ET
from collections import defaultdict
from autocomplete import load_compressed_word_list
def read_xml(file):
"""
Read an XML file of the Greek LSJ dictionary
and return a dictionary with the words and their definitions.
"""
tree = ET.parse(file)
root = tree.getroot()
xml_info = defaultdict(dict)
for entry in root.findall('.//entryFree'):
entry_info = extract_entry_info(entry)
xml_info[entry_info['word']] = entry_info
return xml_info
def extract_entry_info(entry):
    """
    Extract information from an entry in the LSJ dictionary.

    Args:
        entry: An ``entryFree`` XML element.

    Returns:
        A dict with the keys:

        * ``'word'``: text of the first direct ``orth`` child; when the
          entry has no ``orth`` child, the ``key`` attribute is used.
        * ``'lemma'``: value of the entry's ``key`` attribute
          (``None`` if the attribute is absent).
        * ``'orthographies'``: text of every direct ``orth`` child.
        * ``'definitions'``: dict in which ``'tr'`` holds the text of the
          whole entry (all text fragments joined by single spaces and
          stripped), and every other descendant tag name maps to the
          ``text + tail`` of the LAST element carrying that tag.
    """
    lemma = entry.get('key')

    # Entries without an <orth> child previously crashed on ``None.text``;
    # fall back to the lemma so one malformed entry does not abort the
    # parsing of the whole file.
    first_orth = entry.find('orth')
    word = first_orth.text if first_orth is not None else lemma

    orthographies = [orth.text for orth in entry.findall('orth')]

    # Despite its name, the 'tr' slot stores the full text of the entry,
    # not only the content of <tr> elements.
    definitions = {'tr': ' '.join(entry.itertext()).strip()}

    # Walk every descendant in document order (child.iter() yields the
    # child itself first). Repeated tags overwrite each other, so the last
    # occurrence wins; 'tr' is skipped to keep the full-text value above.
    for child in entry:
        for node in child.iter():
            if node.tag != "tr":
                definitions[node.tag] = (node.text or "") + (node.tail or "")

    return {
        'word': word,
        'lemma': lemma,
        'orthographies': orthographies,
        'definitions': definitions,
    }
def main():
    """
    Read the LSJ XML files eng1 through eng27, merge their entries into a
    single dictionary keyed by word, and print the lemma of every entry.
    """
    paths = [f"LSJ_GreekUnicode/grc.lsj.perseus-eng{i}.xml" for i in range(1, 28)]

    merged_info = {}
    for path in paths:
        for word, info in read_xml(path).items():
            # Words are assumed unique across files; if one repeats, the
            # fields of the later entry overwrite those of the earlier one.
            combined = merged_info.setdefault(word, {})
            combined.update(info)

    # Emit one lemma per merged entry, in insertion order.
    for info in merged_info.values():
        print(info['lemma'])
# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()