# EngTexToASLGloss / eng_to_aslGloss_app.py
# Author: Rachel Rakov
# (Header reconstructed from Hugging Face file-viewer chrome so the file is valid Python.)
from pathlib import Path
import gradio as gr
import openai
import os
import tiktoken
# Set openAI key
# NOTE(review): the key is read from the "NextStar" environment variable
# (presumably a Hugging Face Space secret — confirm); os.getenv returns None if unset.
HF_TOKEN = os.getenv("NextStar")
openai.api_key = HF_TOKEN
#Set prompt engineering paths (so globally available)
# Prompt-engineering text files shipped alongside this script; each is read
# once below and sent to GPT-4 as a system message in getASLGloss.
inStructionPath = "intro_instructions_combine.txt"
inRulesPath = "formatting_rules_expanded.txt"
inExamplesPath = "examples_longer1.txt"
inDialoguesPath = "examples_dialogues.txt"
# Helper to read in the prompting files
def openReadFiles(inpath):
    """Read and return the full text contents of the prompt file at *inpath*.

    Args:
        inpath: Path (str or os.PathLike) to a UTF-8 text file.

    Returns:
        The file's entire contents as a single string.
    """
    # pathlib handles open/close for us, and an explicit encoding keeps
    # behavior stable across platforms (the original relied on the locale default).
    return Path(inpath).read_text(encoding="utf-8")
# Set up prompting data (so globally available)
# Loaded once at import time; getASLGloss sends these four strings to GPT-4
# as system messages, in this order, before the user's query.
instruct = openReadFiles(inStructionPath)
rules = openReadFiles(inRulesPath)
examples = openReadFiles(inExamplesPath)
exampleDialogues = openReadFiles(inDialoguesPath)
### In case we eventually want to upload files
# def uploadText():
# '''In case you want to upload a .txt file to translate to ASL gloss'''
# readFile = input("Enter the file path of the .txt you'd like to translate to ASL gloss: ")
# inFile = open(readFile, "r")
# data = inFile.read()
# inFile.close()
# print(f"Your file {readFile} has been uploaded")
# return data
def formatQuery(engText):
    """Prepend the translation instruction prompt to *engText* for GPT-4.

    Args:
        engText: English text to be translated to ASL gloss.

    Returns:
        The instruction header followed immediately by *engText*.
    """
    # Named `prompt_header` rather than `instruct` (as in the original) to
    # avoid shadowing the module-level `instruct` prompt loaded from file.
    prompt_header = (
        "Now, translate the following sentences to perfect ASL gloss using "
        "the grammatical, syntactic, and notation rules you just learned. \n\n"
    )
    return prompt_header + engText
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Return how many tokens *string* encodes to under the named tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: A tiktoken encoding name, e.g. "cl100k_base".

    Returns:
        The token count as an int.
    """
    codec = tiktoken.get_encoding(encoding_name)
    return len(codec.encode(string))
def checkTokens(tokens):
    """Check tokens to ensure we can translate to ASL gloss.

    Args:
        tokens: Token count of the fully formatted query.

    Returns:
        True if the query is under the 553-token budget, False otherwise.
    """
    # Guard clause instead of flag variable; the prints are kept verbatim
    # since they are part of the app's observable (log) behavior.
    if tokens >= 553:
        print(f"Cannot translate to ASL gloss at this time: too many tokens {tokens}")
        return False
    # No f-prefix needed here: the message has no placeholders.
    print("Has less than 553 tokens - can continue translating")
    return True
def getGlossFromText(query):
    """Format *query*, verify its token budget, and return its ASL gloss.

    This is the Gradio callback: it builds the prompt, counts its tokens
    with the "cl100k_base" encoding, and only calls the OpenAI API when
    the prompt fits the budget enforced by checkTokens.

    Args:
        query: English text entered by the user.

    Returns:
        The ASL gloss string from GPT-4, or an error message when the
        prompt is too long.
    """
    text = formatQuery(query)
    tokens = num_tokens_from_string(text, "cl100k_base")
    # Truthiness check instead of the original `== True` comparison.
    if checkTokens(tokens):
        return getASLGloss(text)
    return "Too many tokens: cannot translate"
def getASLGloss(testQs):
    """Get ASL gloss for *testQs* from GPT-4 using our prompt engineering.

    Args:
        testQs: The fully formatted user query (instructions + English text).

    Returns:
        The model's ASL-gloss translation as a string.
    """
    # System messages carry the prompt-engineering context loaded at module
    # import time: instructions, formatting rules, and worked examples.
    prompt_messages = [
        {"role": "system", "content": instruct},
        {"role": "system", "content": rules},
        {"role": "system", "content": examples},
        {"role": "system", "content": exampleDialogues},
        {"role": "user", "content": testQs},
    ]
    # temperature=0 keeps the translation deterministic.
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=prompt_messages,
        temperature=0,
    )
    return response["choices"][0]["message"]["content"]
def main():
    """Build and launch the Gradio English-to-ASL-gloss demo."""
    demo = gr.Interface(
        fn=getGlossFromText,
        inputs="textbox",
        outputs="text",
        title="English to ASL Gloss",
        description="""Translate English text to ASL Gloss""",
        examples=[
            ["Every year I buy my dad a gift"],
            ["I always look forward to the family vacation"],
            ["If I don't travel often, I am sad."],
        ],
    )
    demo.launch()


if __name__ == "__main__":
    main()
# def getAnswer(query, texts = texts, embeddings = embeddings):
# docsearch = FAISS.from_texts(texts, embeddings)
# docs = docsearch.similarity_search(query)
# chain = load_qa_chain(OpenAI(openai_api_key = HF_TOKEN, temperature=0), chain_type="map_reduce", return_map_steps=False)
# response = chain({"input_documents": docs, "question": query}, return_only_outputs=True)
# #interum_q = list(response.keys())
# interum_a = list(response.values())
# q = query
# a = interum_a[0]
# return a
# # query = "describe the fisher database"
# # docs = docsearch.similarity_search(query)
# # chain = load_qa_chain(OpenAI(openai_api_key = "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", temperature=0), chain_type="map_reduce", return_map_steps=False)
# # chain({"input_documents": docs, "question": query}, return_only_outputs=True)
# title = "Query the S Drive!"
# description = """This QA system will answer questions based on information in [data descriptions](https://indeocorp-my.sharepoint.com/:x:/g/personal/rrakov_sorenson_com/EWhs_Gpp9nNEukR7iJLd4mQBPREngKdRGYpT545jX8mY4Q?e=9EeEWF)"""
# interface = gr.Interface(
# fn=getAnswer,
# inputs="textbox",
# outputs="text",
# title = title,
# description = description,
# examples=[["Where is the Fisher database?"], ["Where is the Defined Crowd audio?"], ["Do we have any Spanish audio data?"],
# ["How many audio files do we have in the CallHome database?"]])
# interface.launch()
# if __name__ == "__main__":
# main()
# def main():
# results = setMode()
# print (results)
# main()