Spaces:

GroNLP
/

agalma

Running

App Files Files Community

agalma / lsj_dict.py

Mark7549

add file to extract info from lsj dictionary .xml files

4ea2d3a about 1 year ago

raw

history blame

2.16 kB

	import xml.etree.ElementTree as ET
	from collections import defaultdict
	from autocomplete import load_compressed_word_list


	def read_xml(file):
	    """
	    Parse one XML file of the Greek LSJ dictionary.

	    Every ``entryFree`` element found below the document root is passed to
	    ``extract_entry_info`` and the resulting record is stored under its
	    ``'word'`` value.  When two entries share the same word, the one that
	    appears later in the file replaces the earlier one.

	    Parameters:
	        file: path or file object accepted by ``ET.parse``.

	    Returns:
	        A ``defaultdict(dict)`` mapping each word to its entry record.
	    """
	    document_root = ET.parse(file).getroot()
	    parsed_entries = map(extract_entry_info, document_root.findall('.//entryFree'))

	    xml_info = defaultdict(dict)
	    xml_info.update((parsed['word'], parsed) for parsed in parsed_entries)
	    return xml_info


	def extract_entry_info(entry):
	    """
	    Extract information from an entry in the LSJ dictionary.

	    Parameters:
	        entry: an ``entryFree`` element (``xml.etree.ElementTree.Element``).

	    Returns:
	        A dict with four keys:

	        ``'word'``
	            Text of the first direct ``orth`` child.  If the entry has no
	            ``orth`` child, the ``key`` attribute is used instead.
	        ``'lemma'``
	            Value of the entry's ``key`` attribute (``None`` if absent).
	        ``'orthographies'``
	            Texts of all direct ``orth`` children, in document order.
	        ``'definitions'``
	            Dict whose ``'tr'`` item is the text of the whole entry and
	            whose other items map an element tag to the text plus tail of
	            the last descendant element carrying that tag.
	    """
	    lemma = entry.get('key')

	    orth_elements = entry.findall('orth')
	    orthographies = [orth.text for orth in orth_elements]

	    # An entry without a direct <orth> child would otherwise fail with an
	    # AttributeError on None and abort the whole file; fall back to the
	    # entry's key so the record can still be stored.
	    word = orth_elements[0].text if orth_elements else lemma

	    # Despite the key name, 'tr' holds the text of the *whole* entry (all
	    # text nodes joined by single spaces), not only its <tr> elements.
	    definitions = {'tr': ' '.join(entry.itertext()).strip()}

	    # Record text + tail for every descendant element, keyed by tag.  A tag
	    # that occurs several times keeps only its last occurrence.  <tr>
	    # elements are skipped so they cannot overwrite the entry text above.
	    for child in entry:
	        for node in child.iter():
	            if node.tag != "tr":
	                definitions[node.tag] = (node.text or "") + (node.tail or "")

	    return {
	        'word': word,
	        'lemma': lemma,
	        'orthographies': orthographies,
	        'definitions': definitions,
	    }




	def main():
	    """
	    Read the LSJ files eng1 to eng27 and print the lemma of every entry.

	    The per-file results of ``read_xml`` are merged into one dict keyed by
	    word.  If a word occurs in more than one file, the fields from the
	    later file overwrite those from the earlier one.
	    """
	    volume_paths = (
	        f"LSJ_GreekUnicode/grc.lsj.perseus-eng{volume}.xml"
	        for volume in range(1, 28)  # eng1 to eng27
	    )

	    merged_info = {}
	    for path in volume_paths:
	        for headword, entry_info in read_xml(path).items():
	            # Merge dictionaries, assuming word is unique across all files
	            merged_info.setdefault(headword, {}).update(entry_info)

	    # Print lemmas from the merged dictionary
	    for entry_info in merged_info.values():
	        print(entry_info['lemma'])





	# Run the extraction only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()