Spaces:

GroNLP
/

agalma

Running

File size: 2,164 Bytes

4ea2d3a

import xml.etree.ElementTree as ET
from collections import defaultdict
from autocomplete import load_compressed_word_list


def read_xml(file):
    """
    Parse one XML file of the Greek LSJ dictionary and index its entries.

    Every ``entryFree`` element found anywhere in the document is passed
    to ``extract_entry_info``; the resulting record is stored under its
    ``'word'`` value. When two entries share the same word, the later one
    replaces the earlier one.

    ``file`` is handed straight to ``ET.parse``. The result is a
    ``defaultdict(dict)``, so looking up a word that is absent yields
    (and stores) an empty dict instead of raising ``KeyError``.
    """
    document_root = ET.parse(file).getroot()

    # Records are produced lazily, in document order.
    records = (
        extract_entry_info(node)
        for node in document_root.findall('.//entryFree')
    )

    entries_by_word = defaultdict(dict)
    entries_by_word.update((record['word'], record) for record in records)
    return entries_by_word


def extract_entry_info(entry):
    """
    Build a record for a single entry element of the LSJ dictionary.

    The returned dict has four keys:

    * ``'word'``: text of the first direct ``orth`` child.
    * ``'lemma'``: value of the entry's ``key`` attribute (``None`` if
      the attribute is absent).
    * ``'orthographies'``: text of every direct ``orth`` child, in order.
    * ``'definitions'``: a dict mapping ``'tr'`` to all text inside the
      entry joined with single spaces and stripped, and every other
      descendant tag to the ``text + tail`` of the *last* element that
      carries that tag.

    Raises ``AttributeError`` if the entry has no direct ``orth`` child.
    """
    headword = entry.find('orth').text
    lemma = entry.get('key')
    spellings = [node.text for node in entry.findall('orth')]

    # 'tr' holds the full running text of the entry, not just <tr> elements.
    full_text = ' '.join(entry.itertext()).strip()
    by_tag = {'tr': full_text}

    # Visit every descendant in document order (the entry itself excluded).
    descendants = (node for child in entry for node in child.iter())
    for node in descendants:
        if node.tag == "tr":
            # Skipped so the full-text value stored under 'tr' is kept.
            continue
        leading = node.text if node.text else ""
        trailing = node.tail if node.tail else ""
        # A later element with the same tag overwrites the earlier value.
        by_tag[node.tag] = leading + trailing

    return {
        'word': headword,
        'lemma': lemma,
        'orthographies': spellings,
        'definitions': by_tag,
    }




def main():
    """
    Load the 27 LSJ XML files, merge their entries, and print each lemma.

    Files are read in order from ``eng1`` to ``eng27``. Entries are merged
    by word; if a word appears more than once, fields from the later
    occurrence overwrite those of the earlier one.
    """
    source_paths = [
        f"LSJ_GreekUnicode/grc.lsj.perseus-eng{number}.xml"
        for number in range(1, 28)  # eng1 to eng27
    ]

    combined = {}
    for path in source_paths:
        for headword, record in read_xml(path).items():
            if headword in combined:
                # Word already seen: overlay the newer fields.
                combined[headword].update(record)
            else:
                combined[headword] = dict(record)

    # Print lemmas from the merged dictionary
    for record in combined.values():
        print(record['lemma'])





# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()