EngTexToASLGloss

Runtime error

File size: 6,656 Bytes

from pathlib import Path
import gradio as gr
import openai
import os
import tiktoken 


# Secret read from the "NextStar" environment variable (None when unset).
# Despite the HF_TOKEN name, getASLGloss assigns it to openai.api_key.
HF_TOKEN = os.environ.get("NextStar")


# Prompt-engineering input files, defined at module level so they are
# available to every function in this file.
inStructionPath = "intro_instructions_combine.txt"
inRulesPath = "formatting_rules_expanded.txt"
inExamplesPath = "examples_longer1.txt"
inDialoguesPath = "examples_dialogues.txt"

#Set to read in prompting files
def openReadFiles(inpath):
    """Return the full text content of the file at ``inpath``.

    Args:
        inpath: Location of the file, as a string or ``Path``.

    Returns:
        The file's contents as a single string.

    Raises:
        OSError: If the file cannot be opened (for example, it is missing).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the prompt text reads the same on every
    # platform instead of depending on the locale's default encoding.
    return Path(inpath).read_text(encoding="utf-8")


# Read every prompt component once at import time; the resulting strings
# stay at module level so the request-handling functions can use them.
instruct, rules, examples, exampleDialogues = (
    openReadFiles(promptPath)
    for promptPath in (
        inStructionPath,
        inRulesPath,
        inExamplesPath,
        inDialoguesPath,
    )
)


def formatQuery(engText):
    """Prepend the translation request to the English text for GPT4."""
    # Local name chosen so it does not shadow the module-level ``instruct``.
    requestPrefix = (
        "Now, translate the following sentences to perfect ASL gloss "
        "using the grammatical, syntactic, and notation rules you just "
        "learned. \n\n"
    )
    return requestPrefix + engText


def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens ``string`` produces under the named tiktoken encoding."""
    return len(tiktoken.get_encoding(encoding_name).encode(string))


def checkTokens(tokens, max_tokens=553):
    """Check that a query is short enough to translate to ASL gloss.

    Args:
        tokens: Token count of the formatted query.
        max_tokens: Exclusive upper bound on ``tokens``; a count equal to or
            above it is rejected. Defaults to 553, the limit that used to be
            hard-coded here.

    Returns:
        True when translation can go ahead, False when the query is too long.
    """
    if tokens >= max_tokens:
        print(f"Cannot translate to ASL gloss at this time: too many tokens {tokens}")
        return False
    # Report the limit actually applied rather than a fixed number.
    print(f"Has less than {max_tokens} tokens - can continue translating")
    return True
    
def getGlossFromText(query):
    """Translate English text to ASL gloss, refusing over-long input.

    The text is wrapped in the translation request, its tokens are counted
    with the "cl100k_base" encoding, and the model is only called when
    checkTokens accepts that count.
    """
    prompt = formatQuery(query)
    tokenCount = num_tokens_from_string(prompt, "cl100k_base")
    if not checkTokens(tokenCount):
        return "Too many tokens: cannot translate"
    return getASLGloss(prompt)



def getASLGloss(testQs):
    """Ask GPT-4 for the ASL gloss of an already formatted query.

    The four prompt-engineering texts are sent as system messages in the
    order instructions, rules, examples, example dialogues, followed by
    ``testQs`` as the user message. Returns the content of the first choice.
    """
    # NOTE(review): openai.ChatCompletion is the pre-1.0 client interface;
    # confirm the installed openai version still provides it.
    openai.api_key = HF_TOKEN
    systemPrompts = (instruct, rules, examples, exampleDialogues)
    conversation = [
        {"role": "system", "content": prompt} for prompt in systemPrompts
    ]
    conversation.append({"role": "user", "content": testQs})
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=conversation,
        temperature=0,  # same sampling setting as before: temperature 0
    )
    firstChoice = response["choices"][0]
    return firstChoice["message"]["content"]
     

    

def main():
    """Build the Gradio interface for the translator and launch it.

    The interface passes the textbox content to getGlossFromText and shows
    the string it returns as text output.
    """
    title = "English to ASL Gloss"
    # The backslash continuations keep each continued line's leading
    # indentation inside the string, so re-indenting these lines changes
    # the text handed to Gradio.
    description = "This program uses GPT4 alongside prompt engineering to \
        translate English text to ASL gloss.\n \
        Type in the English sentence you would like to translate into ASL Gloss. \
        \n These are the rules for expressing superscript ASL gloss.\
        Anything within the angle brackets <> indicates superscript notation.\
        If the angle brackets are directly next to a word, the notation inside \
        the angle brackets is associated with just that word, e.g. WILL<A>.  \
        If the angle brackets are next to a whitespace after a word,\
        the notation inside the angle bracket is associated with all of the words\
        before it, up until a comma, another angle bracket, or a double space.\
        This sentence is an example of this rule:\
        \n NEXT-YEAR <Ti>, MY FIANCE <T>, TWO-OF-US MARRY <A>.\
        \n The superscript notation options are as follows:\
        \n Ti marks time\
        \n T marks topic\
        \n A marks comment\
        \n Y/N marks yes-no question\
        \n WHQ marks wh-question\
        \n RHQ marks rhetorical question\
        \n <Cond> marks conditional sentences\
        \n lower case marks directional verbs\
        \n ++ marks emphasis ('very' or 'a lot of')\
        \n # marks lexical fingerspelling \
        \n - marks space between individual letters of fingerspelling\
        \n Note: This is only a prototype of our final product.  It is imperfect \
        and is still in development.\
        \n For additional details on how the program works, please see \
        [the README](https://huggingface.co/spaces/rrakov/EngTexToASLGloss/blob/main/README.md)"

    interface = gr.Interface(
        fn=getGlossFromText,
        inputs="textbox",
        outputs="text",
        title=title,
        description=description,
    )
    # Example prompts that are currently disabled:
    # examples=[["Every year I buy my dad a gift"], ["I always look forward to the family vacation"],
    #         ["If I don't travel often, I am sad."]])
    interface.launch()
    

    
# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    main()

    
#     def getAnswer(query, texts = texts, embeddings = embeddings):
#         docsearch = FAISS.from_texts(texts, embeddings)
#         docs = docsearch.similarity_search(query)
#         chain = load_qa_chain(OpenAI(openai_api_key = HF_TOKEN, temperature=0), chain_type="map_reduce", return_map_steps=False)
#         response = chain({"input_documents": docs, "question": query}, return_only_outputs=True)
#             #interum_q = list(response.keys())
#         interum_a = list(response.values())
#         q = query
#         a = interum_a[0]
#         return a

#     # query = "describe the fisher database"
#     # docs = docsearch.similarity_search(query)
#     # chain = load_qa_chain(OpenAI(openai_api_key = HF_TOKEN, temperature=0), chain_type="map_reduce", return_map_steps=False)
#     # chain({"input_documents": docs, "question": query}, return_only_outputs=True)
#     title = "Query the S Drive!"
#     description = """This QA system will answer questions based on information in [data descriptions](https://indeocorp-my.sharepoint.com/:x:/g/personal/rrakov_sorenson_com/EWhs_Gpp9nNEukR7iJLd4mQBPREngKdRGYpT545jX8mY4Q?e=9EeEWF)"""

#     interface = gr.Interface(
#         fn=getAnswer, 
#         inputs="textbox", 
#         outputs="text",
#         title = title,
#         description = description,
#         examples=[["Where is the Fisher database?"], ["Where is the Defined Crowd audio?"], ["Do we have any Spanish audio data?"], 
#                 ["How many audio files do we have in the CallHome database?"]])
#     interface.launch()
    

    
# if __name__ == "__main__":
#     main()

# def main():
#     results = setMode()
#     print (results)
# main()