Spaces:

GroNLP
/

agalma

Running

App Files Files Community

Mark7549 committed on May 3, 2024

Commit

4ea2d3a

1 Parent(s): dcc32dc

Add file to extract info from LSJ dictionary .xml files

Browse files

Files changed (1) hide show

lsj_dict.py +74 -0

lsj_dict.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import xml.etree.ElementTree as ET
+from collections import defaultdict
+from autocomplete import load_compressed_word_list
def read_xml(file):
    """
    Parse one XML file of the Greek LSJ dictionary into a word-keyed mapping.

    Every ``entryFree`` element found anywhere below the document root is
    handed to ``extract_entry_info``. The record it returns is stored under
    the record's ``'word'`` value, so a later entry with the same word
    replaces an earlier one.

    ``file`` is passed straight to ``ET.parse``. The result is a
    ``defaultdict(dict)`` mapping each word to its entry record.
    """
    document_root = ET.parse(file).getroot()
    entries_by_word = defaultdict(dict)
    for element in document_root.findall('.//entryFree'):
        record = extract_entry_info(element)
        entries_by_word[record['word']] = record
    return entries_by_word
def extract_entry_info(entry):
    """
    Extract information from one entry element of the LSJ dictionary.

    ``entry`` is an ElementTree element (``read_xml`` passes ``entryFree``
    elements).

    Returns a dict with four keys:

    * ``'word'`` - text of the first direct ``orth`` child. When the entry
      has no ``orth`` child, the value of the ``key`` attribute is used
      instead of raising ``AttributeError``.
    * ``'lemma'`` - value of the entry's ``key`` attribute (``None`` if the
      attribute is absent).
    * ``'orthographies'`` - text of every direct ``orth`` child, in order.
    * ``'definitions'`` - a dict. Its ``'tr'`` item holds all text of the
      entry joined by single spaces and stripped. Every other item maps a
      descendant tag name to ``text + tail`` of the LAST element carrying
      that tag; earlier elements with the same tag are overwritten.
    """
    lemma = entry.get('key')

    # An entry without a direct <orth> child used to crash on `.text`;
    # fall back to the lemma key so the entry can still be indexed.
    first_orth = entry.find('orth')
    word = first_orth.text if first_orth is not None else lemma

    orthographies = [orth.text for orth in entry.findall('orth')]

    # The 'tr' slot holds the full running text of the entry, not only the
    # content of <tr> elements.
    full_text = ' '.join(entry.itertext()).strip()
    definitions = {'tr': full_text}

    for child in entry:
        # child.iter() yields the child itself followed by its descendants.
        for node in child.iter():
            if node.tag == "tr":
                # Keep the full-text value stored above under 'tr'.
                continue
            definitions[node.tag] = (node.text or "") + (node.tail or "")

    return {
        'word': word,
        'lemma': lemma,
        'orthographies': orthographies,
        'definitions': definitions,
    }
def main():
    """
    Read the LSJ XML parts eng1..eng27, merge them and print every lemma.

    Each part is loaded with ``read_xml`` from the ``LSJ_GreekUnicode``
    directory. Records are merged per word with ``dict.update``, so if the
    same word appears in more than one file the later file's fields win.
    """
    combined = {}
    for part in range(1, 28):  # parts are numbered 1 through 27
        path = f"LSJ_GreekUnicode/grc.lsj.perseus-eng{part}.xml"
        for word, info in read_xml(path).items():
            combined.setdefault(word, {}).update(info)

    # Only the records are needed for printing, not the word keys.
    for record in combined.values():
        print(record['lemma'])


if __name__ == "__main__":
    main()