EngTexToASLGloss

Runtime error

EngTexToASLGloss / eng_to_aslGloss_app.py

Rachel Rakov

Improving instruction formatting

69cbf5e almost 2 years ago

6.94 kB

	from pathlib import Path
	import gradio as gr
	import openai
	import os
	import tiktoken


	# Set secret key
	# Read from the "NextStar" environment variable; os.getenv returns None when
	# that variable is unset. Despite the HF_ prefix, getASLGloss assigns this
	# value to openai.api_key, i.e. it is used as the OpenAI API key.
	HF_TOKEN = os.getenv("NextStar")


	#Set prompt engineering paths (so globally available)
	# These are relative paths, so they are resolved against the process's
	# current working directory when openReadFiles opens them.
	inStructionPath = "intro_instructions_combine.txt"
	inRulesPath = "formatting_rules_expanded.txt"
	inExamplesPath = "examples_longer1.txt"
	inDialoguesPath = "examples_dialogues.txt"

	#Set to read in prompting files
	def openReadFiles(inpath):
	    """Return the entire text content of the file at ``inpath``.

	    Args:
	        inpath: Path to the file, as a string or ``os.PathLike`` object.

	    Returns:
	        The file's content as a single string.

	    Raises:
	        OSError: If the file cannot be opened (e.g. it does not exist).
	        UnicodeDecodeError: If the file is not valid UTF-8.
	    """
	    # Decode explicitly as UTF-8 instead of relying on the platform's
	    # locale-dependent default encoding, so the prompt files load
	    # identically on every host.
	    return Path(inpath).read_text(encoding="utf-8")


	# Load every prompt-engineering text once at import time so the functions
	# below can use them as module-level globals.
	instruct, rules, examples, exampleDialogues = [
	    openReadFiles(promptPath)
	    for promptPath in (
	        inStructionPath,
	        inRulesPath,
	        inExamplesPath,
	        inDialoguesPath,
	    )
	]


	def formatQuery(engText):
	    """Prefix the English text with the translation request sent to GPT-4.

	    Returns the fixed instruction sentence followed directly by ``engText``.
	    """
	    # Named differently from the module-level ``instruct`` prompt so the
	    # two are not confused.
	    translationRequest = (
	        "Now, translate the following sentences to perfect ASL gloss "
	        "using the grammatical, syntactic, and notation rules you just "
	        "learned. \n\n"
	    )
	    return translationRequest + engText


	def num_tokens_from_string(string: str, encoding_name: str) -> int:
	    """Count the tokens ``string`` encodes to under the named tiktoken encoding."""
	    tokenizer = tiktoken.get_encoding(encoding_name)
	    return len(tokenizer.encode(string))


	def checkTokens(tokens, max_tokens=553):
	    """Check whether a query is short enough to translate to ASL gloss.

	    Args:
	        tokens: Number of tokens in the formatted query.
	        max_tokens: Exclusive upper bound on ``tokens``; a query with this
	            many tokens or more is rejected. Defaults to 553, the limit
	            that was previously hard-coded.

	    Returns:
	        True if ``tokens`` is below ``max_tokens``, otherwise False. A
	        status line is printed in either case.
	    """
	    tooMany = tokens >= max_tokens
	    if tooMany:
	        print(f"Cannot translate to ASL gloss at this time: too many tokens {tokens}")
	    else:
	        print(f"Has less than {max_tokens} tokens - can continue translating")
	    return not tooMany

	def getGlossFromText(query):
	    """Translate English text to ASL gloss, refusing over-long inputs.

	    The text is wrapped with the translation instruction, its token count is
	    measured with the "cl100k_base" encoding, and the request is passed on
	    to getASLGloss only if checkTokens accepts that count. Otherwise a fixed
	    refusal message is returned.
	    """
	    prompt = formatQuery(query)
	    tokenCount = num_tokens_from_string(prompt, "cl100k_base")
	    if not checkTokens(tokenCount):
	        return "Too many tokens: cannot translate"
	    return getASLGloss(prompt)



	def getASLGloss(testQs):
	    """Get ASL gloss from OpenAI using our prompt engineering.

	    The four prompt texts loaded at module level (instruct, rules, examples,
	    exampleDialogues) are sent as system messages, followed by ``testQs`` as
	    the user message. The request uses model "gpt-4" with temperature 0.

	    Works with both generations of the ``openai`` package: releases from
	    1.0 onward removed ``openai.ChatCompletion`` in favour of a client
	    object, so the client is used whenever the installed package has it.

	    Args:
	        testQs: The fully formatted user query.

	    Returns:
	        The text content of the first completion choice.
	    """
	    # Kept for both code paths so the module-level key is set exactly as
	    # before.
	    openai.api_key = HF_TOKEN
	    messages = [
	        {"role": "system", "content": instruct},
	        {"role": "system", "content": rules},
	        {"role": "system", "content": examples},
	        {"role": "system", "content": exampleDialogues},
	        {"role": "user", "content": testQs},
	    ]

	    if hasattr(openai, "OpenAI"):
	        # openai >= 1.0: calling openai.ChatCompletion raises, so go through
	        # a client instance and read the response via attributes.
	        client = openai.OpenAI(api_key=HF_TOKEN)
	        completion = client.chat.completions.create(
	            model='gpt-4',
	            messages=messages,
	            temperature=0,
	        )
	        return completion.choices[0].message.content

	    # openai < 1.0: legacy module-level API with a dict-style response.
	    completion = openai.ChatCompletion.create(
	        model='gpt-4',
	        messages=messages,
	        temperature=0,
	    )
	    return completion['choices'][0]['message']['content']




	def main():
	"""Build the Gradio interface for the English-to-ASL-gloss translator and launch it."""

	title = "English to ASL Gloss"
	#description = """Translate English text to ASL Gloss"""
	# The description below is ONE string literal continued across lines with
	# backslash-newline, so any leading whitespace on a continuation line is
	# part of the string, and no comment can be placed between those lines.
	# NOTE(review): "\<", "\>", "\#" and "\-" are not recognised Python escape
	# sequences; Python leaves the backslash in the string, but newer versions
	# emit a warning for them. Writing them as "\\<" etc. would give the same
	# text without the warning -- confirm the intended rendering first.
	# NOTE(review): the user-facing text contains typos ("is associate with",
	# "emphesis"); left untouched here because it is runtime text.
	description = "This program uses GPT4 alongside prompt engineering to \
	translate English text to ASL gloss.\n \
	<b>Type in the English sentence you would like to translate into ASL Gloss.</b> \
	\n \n This version of EngToASLGloss contains superscript notation which adds \
	grammatical context to assist in ASL generation. \
	\n Below are the guidelines we are using to express grammatical concepts \
	in ASL gloss.\
	Anything within the angle brackets < > indicates this additional grammatical notation.\
	If the angle brackets are directly next to a word, the notation inside \
	the angle brackets is associate with just that word, e.g. WILL < A >. \
	If the angle brackets are next to a whitespace after a word,\
	the notation inside the angle bracket is associated with all of the words\
	before it, up until a comma, another angle bracket, or a double space.\
	\n \n This sentence is an example of this rule:\
	\n NEXT-YEAR < Ti >, MY FIANCE < T >, TWO-OF-US MARRY \< A \>.\
	\n\r \
	\n The superscript notation options that will appear in results are as follows:\
	\n Ti marks time\
	\n T marks topic\
	\n A marks comment\
	\n Y/N marks yes-no question\
	\n WHQ marks wh-question\
	\n RHQ marks rhetorical question\
	\n < Cond > marks conditional sentences\
	\n lower case marks directional verbs\
	\n ++ marks emphesis ('very' or 'a lot of')\
	\n \# marks lexical fingerspelling \
	\n \- marks space between individual letters of fingerspelling\
	\n \n <b>Note: This is a prototype and is still in development. \
	Do not use it in a production deployment.</b> \
	\n For additional details on how the program works, please see \
	[the README](https://huggingface.co/spaces/rrakov/EngTexToASLGloss/blob/main/README.md)"

	# One textbox input and one text output; getGlossFromText is the function
	# handed to Gradio to produce the output from the input.
	interface = gr.Interface(
	fn=getGlossFromText,
	inputs="textbox",
	outputs="text",
	title = title,
	description = description)
	#examples = [[("Prompt: Every year I buy my dad a gift \n", "Result: EVERY-YEAR<Ti>, MY DAD GIFT<T>, ME BUY<A>")]])
	# examples=[["Every year I buy my dad a gift"], ["I always look forward to the family vacation"],
	# ["If I don't travel often, I am sad."]])
	# Start serving the interface.
	interface.launch()



	# Launch the app only when this file is executed as a script, not when it
	# is imported as a module.
	if __name__ == "__main__":
	main()


	# def getAnswer(query, texts = texts, embeddings = embeddings):
	# docsearch = FAISS.from_texts(texts, embeddings)
	# docs = docsearch.similarity_search(query)
	# chain = load_qa_chain(OpenAI(openai_api_key = HF_TOKEN, temperature=0), chain_type="map_reduce", return_map_steps=False)
	# response = chain({"input_documents": docs, "question": query}, return_only_outputs=True)
	# #interum_q = list(response.keys())
	# interum_a = list(response.values())
	# q = query
	# a = interum_a[0]
	# return a

	# # query = "describe the fisher database"
	# # docs = docsearch.similarity_search(query)
	# # chain = load_qa_chain(OpenAI(openai_api_key = "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", temperature=0), chain_type="map_reduce", return_map_steps=False)
	# # chain({"input_documents": docs, "question": query}, return_only_outputs=True)
	# title = "Query the S Drive!"
	# description = """This QA system will answer questions based on information in [data descriptions](https://indeocorp-my.sharepoint.com/:x:/g/personal/rrakov_sorenson_com/EWhs_Gpp9nNEukR7iJLd4mQBPREngKdRGYpT545jX8mY4Q?e=9EeEWF)"""

	# interface = gr.Interface(
	# fn=getAnswer,
	# inputs="textbox",
	# outputs="text",
	# title = title,
	# description = description,
	# examples=[["Where is the Fisher database?"], ["Where is the Defined Crowd audio?"], ["Do we have any Spanish audio data?"],
	# ["How many audio files do we have in the CallHome database?"]])
	# interface.launch()



	# if __name__ == "__main__":
	# main()

	# def main():
	# results = setMode()
	# print (results)
	# main()