from pathlib import Path
import gradio as gr
import openai
import os
import tiktoken
from huggingface_hub import login
# Log in to the Hugging Face Hub
login()

# Set the OpenAI API key (stored in the environment secret "NextStar")
HF_TOKEN = os.getenv("NextStar")
openai.api_key = HF_TOKEN
# Prompt engineering file paths (so globally available)
inStructionPath = "intro_instructions_combine.txt"
inRulesPath = "formatting_rules_expanded.txt"
inExamplesPath = "examples_longer1.txt"
inDialoguesPath = "examples_dialogues.txt"
# Helper to read in the prompting files
def openReadFiles(inpath):
    """Read a prompt file and return its contents as a single string."""
    infile = Path(inpath)
    with open(infile) as f:
        data = f.read()
    return data
# Set up prompting data (so globally available)
instruct = openReadFiles(inStructionPath)
rules = openReadFiles(inRulesPath)
examples = openReadFiles(inExamplesPath)
exampleDialogues = openReadFiles(inDialoguesPath)
### In case we eventually want to upload files
# def uploadText():
# '''In case you want to upload a .txt file to translate to ASL gloss'''
# readFile = input("Enter the file path of the .txt you'd like to translate to ASL gloss: ")
# inFile = open(readFile, "r")
# data = inFile.read()
# inFile.close()
# print(f"Your file {readFile} has been uploaded")
# return data
def formatQuery(engText):
    """Prepend the translation instruction to the English text for GPT-4."""
    # Local name kept distinct from the global `instruct` prompt loaded above
    prefix = "Now, translate the following sentences to perfect ASL gloss using the grammatical, syntactic, and notation rules you just learned. \n\n"
    query = prefix + engText
    return query
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens
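# A quick usage sketch (hypothetical input sentence; "cl100k_base" is the tiktoken
# encoding used by GPT-4-family models):
#   num_tokens_from_string("I always look forward to the family vacation", "cl100k_base")
#   # -> a small integer token count, which checkTokens() below compares against the limit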
def checkTokens(tokens):
    """Check the token count to ensure the text is short enough to translate to ASL gloss."""
    if tokens >= 553:
        print(f"Cannot translate to ASL gloss at this time: too many tokens ({tokens})")
        goAhead = False
    else:
        goAhead = True
        print("Has fewer than 553 tokens - can continue translating")
    return goAhead
def getGlossFromText(query):
    """Format the query, check its token count, and return the ASL gloss translation."""
    text = formatQuery(query)
    tokens = num_tokens_from_string(text, "cl100k_base")
    goAhead = checkTokens(tokens)
    if goAhead:
        results = getASLGloss(text)
    else:
        results = "Too many tokens: cannot translate"
    return results
def getASLGloss(testQs):
    """Get ASL gloss from OpenAI using our prompt engineering"""
    completion = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": instruct},
            {"role": "system", "content": rules},
            {"role": "system", "content": examples},
            {"role": "system", "content": exampleDialogues},
            {"role": "user", "content": testQs},
        ],
        temperature=0,
    )
    results = completion["choices"][0]["message"]["content"]
    return results
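# NOTE: the call above uses the legacy openai 0.x ChatCompletion interface. A rough
# sketch of the equivalent call with the openai>=1.0 client (an assumption; this
# Space does not use that version) would be:
#   from openai import OpenAI
#   client = OpenAI(api_key=HF_TOKEN)
#   completion = client.chat.completions.create(
#       model="gpt-4",
#       messages=[...same system/user messages as above...],
#       temperature=0,
#   )
#   results = completion.choices[0].message.content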
def main():
    title = "English to ASL Gloss"
    description = """Translate English text to ASL Gloss"""
    interface = gr.Interface(
        fn=getGlossFromText,
        inputs="textbox",
        outputs="text",
        title=title,
        description=description,
        examples=[
            ["Every year I buy my dad a gift"],
            ["I always look forward to the family vacation"],
            ["If I don't travel often, I am sad."],
        ],
    )
    interface.launch()
if __name__ == "__main__":
    main()
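# To run this locally (a sketch, assuming this file is the Space's app.py, the four
# prompt .txt files sit next to it, and the "NextStar" secret is exported as an
# environment variable):
#   $ pip install gradio "openai<1.0" tiktoken huggingface_hub
#   $ python app.py   # Gradio prints a local URL (by default http://127.0.0.1:7860)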
# def getAnswer(query, texts = texts, embeddings = embeddings):
# docsearch = FAISS.from_texts(texts, embeddings)
# docs = docsearch.similarity_search(query)
# chain = load_qa_chain(OpenAI(openai_api_key = HF_TOKEN, temperature=0), chain_type="map_reduce", return_map_steps=False)
# response = chain({"input_documents": docs, "question": query}, return_only_outputs=True)
# #interum_q = list(response.keys())
# interum_a = list(response.values())
# q = query
# a = interum_a[0]
# return a
# # query = "describe the fisher database"
# # docs = docsearch.similarity_search(query)
# # chain = load_qa_chain(OpenAI(openai_api_key = HF_TOKEN, temperature=0), chain_type="map_reduce", return_map_steps=False)
# # chain({"input_documents": docs, "question": query}, return_only_outputs=True)
# title = "Query the S Drive!"
# description = """This QA system will answer questions based on information in [data descriptions](https://indeocorp-my.sharepoint.com/:x:/g/personal/rrakov_sorenson_com/EWhs_Gpp9nNEukR7iJLd4mQBPREngKdRGYpT545jX8mY4Q?e=9EeEWF)"""
# interface = gr.Interface(
# fn=getAnswer,
# inputs="textbox",
# outputs="text",
# title = title,
# description = description,
# examples=[["Where is the Fisher database?"], ["Where is the Defined Crowd audio?"], ["Do we have any Spanish audio data?"],
# ["How many audio files do we have in the CallHome database?"]])
# interface.launch()
# if __name__ == "__main__":
# main()
# def main():
# results = setMode()
# print (results)
# main()