import os import openai import torch import tensorflow as tf from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering import gradio as gr import re # Set your OpenAI API key here temporarily for testing openai.api_key = os.getenv("OPENAI_API_KEY") # Check if GPU is available and use it if possible device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Load the English models and tokenizers qa_model_name_v1 = 'salsarra/ConfliBERT-QA' qa_model_v1 = TFAutoModelForQuestionAnswering.from_pretrained(qa_model_name_v1) qa_tokenizer_v1 = AutoTokenizer.from_pretrained(qa_model_name_v1) bert_model_name_v1 = 'salsarra/BERT-base-cased-SQuAD-v1' bert_qa_model_v1 = TFAutoModelForQuestionAnswering.from_pretrained(bert_model_name_v1) bert_qa_tokenizer_v1 = AutoTokenizer.from_pretrained(bert_model_name_v1) # Load Spanish models and tokenizers confli_model_spanish_name = 'salsarra/ConfliBERT-Spanish-Beto-Cased-NewsQA' confli_model_spanish = TFAutoModelForQuestionAnswering.from_pretrained(confli_model_spanish_name) confli_tokenizer_spanish = AutoTokenizer.from_pretrained(confli_model_spanish_name) beto_model_spanish_name = 'salsarra/Beto-Spanish-Cased-NewsQA' beto_model_spanish = TFAutoModelForQuestionAnswering.from_pretrained(beto_model_spanish_name) beto_tokenizer_spanish = AutoTokenizer.from_pretrained(beto_model_spanish_name) # Load the additional Spanish models confli_sqac_model_spanish = 'salsarra/ConfliBERT-Spanish-Beto-Cased-SQAC' confli_sqac_model_spanish_qa = TFAutoModelForQuestionAnswering.from_pretrained(confli_sqac_model_spanish) confli_sqac_tokenizer_spanish = AutoTokenizer.from_pretrained(confli_sqac_model_spanish) beto_sqac_model_spanish = 'salsarra/Beto-Spanish-Cased-SQAC' beto_sqac_model_spanish_qa = TFAutoModelForQuestionAnswering.from_pretrained(beto_sqac_model_spanish) beto_sqac_tokenizer_spanish = AutoTokenizer.from_pretrained(beto_sqac_model_spanish) # Load specified ConfliBERT Arabic models confli_model_arabic_1_name = 'salsarra/ConfliBERT-Arabic-Arabertv2-QA-MLQA' confli_model_arabic_1 = TFAutoModelForQuestionAnswering.from_pretrained(confli_model_arabic_1_name) confli_tokenizer_arabic_1 = AutoTokenizer.from_pretrained(confli_model_arabic_1_name) confli_model_arabic_2_name = 'salsarra/ConfliBERT-Arabic-Arabertv2-QA-XQUAD' confli_model_arabic_2 = TFAutoModelForQuestionAnswering.from_pretrained(confli_model_arabic_2_name) confli_tokenizer_arabic_2 = AutoTokenizer.from_pretrained(confli_model_arabic_2_name) confli_model_arabic_3_name = 'salsarra/ConfliBERT-Arabic-Arabertv2-QA-ARCD' confli_model_arabic_3 = TFAutoModelForQuestionAnswering.from_pretrained(confli_model_arabic_3_name) confli_tokenizer_arabic_3 = AutoTokenizer.from_pretrained(confli_model_arabic_3_name) # Load specified BERT Arabic models (AraBERTv2) bert_model_arabic_1_name = 'salsarra/Bert-Base-Arabertv2-QA-MLQA' bert_qa_model_arabic_1 = TFAutoModelForQuestionAnswering.from_pretrained(bert_model_arabic_1_name) bert_qa_tokenizer_arabic_1 = AutoTokenizer.from_pretrained(bert_model_arabic_1_name) bert_model_arabic_2_name = 'salsarra/Bert-Base-Arabertv2-QA-XQUAD' bert_qa_model_arabic_2 = TFAutoModelForQuestionAnswering.from_pretrained(bert_model_arabic_2_name) bert_qa_tokenizer_arabic_2 = AutoTokenizer.from_pretrained(bert_model_arabic_2_name) bert_model_arabic_3_name = 'salsarra/Bert-Base-Arabertv2-QA-ARCD' bert_qa_model_arabic_3 = TFAutoModelForQuestionAnswering.from_pretrained(bert_model_arabic_3_name) bert_qa_tokenizer_arabic_3 = AutoTokenizer.from_pretrained(bert_model_arabic_3_name) # Define error handling to separate input size errors from other issues def handle_error_message(e, default_limit=512): error_message = str(e) pattern = re.compile(r"The size of tensor a \\((\\d+)\\) must match the size of tensor b \\((\\d+)\\)") match = pattern.search(error_message) if match: number_1, number_2 = match.groups() return f"Error: Text Input is over limit where inserted text size {number_1} is larger than model limits of {number_2}" pattern_qa = re.compile(r"indices\\[0,(\\d+)\\] = \\d+ is not in \\[0, (\\d+)\\)") match_qa = pattern_qa.search(error_message) if match_qa: number_1, number_2 = match_qa.groups() return f"Error: Text Input is over limit where inserted text size {number_1} is larger than model limits of {number_2}" return f"Error: {error_message}" # Define question_answering_v1 for ConfliBERT English with truncation=True def question_answering_v1(context, question): try: inputs = qa_tokenizer_v1(question, context, return_tensors='tf', truncation=True) outputs = qa_model_v1(inputs) answer_start = tf.argmax(outputs.start_logits, axis=1).numpy()[0] answer_end = tf.argmax(outputs.end_logits, axis=1).numpy()[0] + 1 answer = qa_tokenizer_v1.convert_tokens_to_string( qa_tokenizer_v1.convert_ids_to_tokens(inputs['input_ids'].numpy()[0][answer_start:answer_end]) ) return f"{answer}" except Exception as e: return handle_error_message(e) # Define bert_question_answering_v1 for BERT English with truncation=True def bert_question_answering_v1(context, question): try: inputs = bert_qa_tokenizer_v1(question, context, return_tensors='tf', truncation=True) outputs = bert_qa_model_v1(inputs) answer_start = tf.argmax(outputs.start_logits, axis=1).numpy()[0] answer_end = tf.argmax(outputs.end_logits, axis=1).numpy()[0] + 1 answer = bert_qa_tokenizer_v1.convert_tokens_to_string( bert_qa_tokenizer_v1.convert_ids_to_tokens(inputs['input_ids'].numpy()[0][answer_start:answer_end]) ) return f"{answer}" except Exception as e: return handle_error_message(e) # Define question_answering_spanish for ConfliBERT-Spanish-Beto-Cased-NewsQA def question_answering_spanish(context, question): try: inputs = confli_tokenizer_spanish.encode_plus(question, context, return_tensors='tf', truncation=True) outputs = confli_model_spanish(inputs) answer_start = tf.argmax(outputs.start_logits, axis=1).numpy()[0] answer_end = tf.argmax(outputs.end_logits, axis=1).numpy()[0] + 1 answer = confli_tokenizer_spanish.convert_tokens_to_string( confli_tokenizer_spanish.convert_ids_to_tokens(inputs['input_ids'].numpy()[0][answer_start:answer_end]) ) return f"{answer}" except Exception as e: return handle_error_message(e) # Define beto_question_answering_spanish for Beto-Spanish-Cased-NewsQA def beto_question_answering_spanish(context, question): try: inputs = beto_tokenizer_spanish.encode_plus(question, context, return_tensors='tf', truncation=True) outputs = beto_model_spanish(inputs) answer_start = tf.argmax(outputs.start_logits, axis=1).numpy()[0] answer_end = tf.argmax(outputs.end_logits, axis=1).numpy()[0] + 1 answer = beto_tokenizer_spanish.convert_tokens_to_string( beto_tokenizer_spanish.convert_ids_to_tokens(inputs['input_ids'].numpy()[0][answer_start:answer_end]) ) return f"{answer}" except Exception as e: return handle_error_message(e) # Define confli_sqac_question_answering_spanish for ConfliBERT-Spanish-Beto-Cased-SQAC def confli_sqac_question_answering_spanish(context, question): inputs = confli_sqac_tokenizer_spanish.encode_plus(question, context, return_tensors="tf", truncation=True) outputs = confli_sqac_model_spanish_qa(inputs) answer_start = tf.argmax(outputs.start_logits, axis=1).numpy()[0] answer_end = tf.argmax(outputs.end_logits, axis=1).numpy()[0] + 1 answer = confli_sqac_tokenizer_spanish.convert_tokens_to_string( confli_sqac_tokenizer_spanish.convert_ids_to_tokens(inputs['input_ids'].numpy()[0][answer_start:answer_end]) ) return f"{answer}" # Define beto_sqac_question_answering_spanish for Beto-Spanish-Cased-SQAC def beto_sqac_question_answering_spanish(context, question): inputs = beto_sqac_tokenizer_spanish.encode_plus(question, context, return_tensors="tf", truncation=True) outputs = beto_sqac_model_spanish_qa(inputs) answer_start = tf.argmax(outputs.start_logits, axis=1).numpy()[0] answer_end = tf.argmax(outputs.end_logits, axis=1).numpy()[0] + 1 answer = beto_sqac_tokenizer_spanish.convert_tokens_to_string( beto_sqac_tokenizer_spanish.convert_ids_to_tokens(inputs['input_ids'].numpy()[0][answer_start:answer_end]) ) return f"{answer}" # ConfliBERT Arabic Model 1 def question_answering_confli_arabic_1(context, question): try: inputs = confli_tokenizer_arabic_1(question, context, return_tensors='tf', truncation=True) outputs = confli_model_arabic_1(inputs) answer_start = tf.argmax(outputs.start_logits, axis=1).numpy()[0] answer_end = tf.argmax(outputs.end_logits, axis=1).numpy()[0] + 1 answer = confli_tokenizer_arabic_1.convert_tokens_to_string( confli_tokenizer_arabic_1.convert_ids_to_tokens(inputs['input_ids'].numpy()[0][answer_start:answer_end]) ) return f"{answer}" except Exception as e: return handle_error_message(e) # Add functions for other ConfliBERT and BERT models similarly def question_answering_confli_arabic_2(context, question): inputs = confli_tokenizer_arabic_2(question, context, return_tensors='tf', truncation=True) outputs = confli_model_arabic_2(inputs) answer_start = tf.argmax(outputs.start_logits, axis=1).numpy()[0] answer_end = tf.argmax(outputs.end_logits, axis=1).numpy()[0] + 1 answer = confli_tokenizer_arabic_2.convert_tokens_to_string( confli_tokenizer_arabic_2.convert_ids_to_tokens(inputs['input_ids'].numpy()[0][answer_start:answer_end]) ) return f"{answer}" def question_answering_confli_arabic_3(context, question): inputs = confli_tokenizer_arabic_3(question, context, return_tensors='tf', truncation=True) outputs = confli_model_arabic_3(inputs) answer_start = tf.argmax(outputs.start_logits, axis=1).numpy()[0] answer_end = tf.argmax(outputs.end_logits, axis=1).numpy()[0] + 1 answer = confli_tokenizer_arabic_3.convert_tokens_to_string( confli_tokenizer_arabic_3.convert_ids_to_tokens(inputs['input_ids'].numpy()[0][answer_start:answer_end]) ) return f"{answer}" # Similarly, for BERT models def question_answering_bert_arabic_1(context, question): inputs = bert_qa_tokenizer_arabic_1(question, context, return_tensors='tf', truncation=True) outputs = bert_qa_model_arabic_1(inputs) answer_start = tf.argmax(outputs.start_logits, axis=1).numpy()[0] answer_end = tf.argmax(outputs.end_logits, axis=1).numpy()[0] + 1 answer = bert_qa_tokenizer_arabic_1.convert_tokens_to_string( bert_qa_tokenizer_arabic_1.convert_ids_to_tokens(inputs['input_ids'].numpy()[0][answer_start:answer_end]) ) return f"{answer}" # BERT Arabic Model 2 (XQUAD) def question_answering_bert_arabic_2(context, question): try: inputs = bert_qa_tokenizer_arabic_2(question, context, return_tensors='tf', truncation=True) outputs = bert_qa_model_arabic_2(inputs) answer_start = tf.argmax(outputs.start_logits, axis=1).numpy()[0] answer_end = tf.argmax(outputs.end_logits, axis=1).numpy()[0] + 1 answer = bert_qa_tokenizer_arabic_2.convert_tokens_to_string( bert_qa_tokenizer_arabic_2.convert_ids_to_tokens(inputs['input_ids'].numpy()[0][answer_start:answer_end]) ) return f"{answer}" except Exception as e: return handle_error_message(e) # BERT Arabic Model 3 (ARCD) def question_answering_bert_arabic_3(context, question): try: inputs = bert_qa_tokenizer_arabic_3(question, context, return_tensors='tf', truncation=True) outputs = bert_qa_model_arabic_3(inputs) answer_start = tf.argmax(outputs.start_logits, axis=1).numpy()[0] answer_end = tf.argmax(outputs.end_logits, axis=1).numpy()[0] + 1 answer = bert_qa_tokenizer_arabic_3.convert_tokens_to_string( bert_qa_tokenizer_arabic_3.convert_ids_to_tokens(inputs['input_ids'].numpy()[0][answer_start:answer_end]) ) return f"{answer}" except Exception as e: return handle_error_message(e) # Define a function to get ChatGPT's answer in English using the latest OpenAI API def chatgpt_question_answering(context, question): messages = [ {"role": "system", "content": "You are a helpful assistant. Only answer based on the provided context. Do not use any external knowledge."}, {"role": "user", "content": f"Context: {context}\nQuestion: {question}\nAnswer:"} ] response = openai.ChatCompletion.create( model="gpt-3.5-turbo", messages=messages, max_tokens=500 ) return response['choices'][0]['message']['content'].strip() # Define a function to get ChatGPT's answer in Spanish using the latest OpenAI API def chatgpt_question_answering_spanish(context, question): messages = [ {"role": "system", "content": "You are a helpful assistant that responds in Spanish. Only answer based on the provided context. Do not use any external knowledge."}, {"role": "user", "content": f"Contexto: {context}\nPregunta: {question}\nRespuesta:"} ] response = openai.ChatCompletion.create( model="gpt-3.5-turbo", messages=messages, max_tokens=500 ) return response['choices'][0]['message']['content'].strip() # Define a function to get ChatGPT's answer in Arabic using the latest OpenAI API def chatgpt_question_answering_arabic(context, question): messages = [ {"role": "system", "content": "أنت مساعد ذكي ومفيد. أجب فقط بناءً على النص المُعطى في السياق. لا تستخدم أي معرفة خارجية."}, {"role": "user", "content": f"السياق: {context}\nالسؤال: {question}\nالإجابة:"} ] response = openai.ChatCompletion.create( model="gpt-3.5-turbo", messages=messages, max_tokens=500 ) return response['choices'][0]['message']['content'].strip() # Main comparison function with language selection def compare_question_answering(language, context, question): if language == "English": confli_answer_v1 = question_answering_v1(context, question) bert_answer_v1 = bert_question_answering_v1(context, question) chatgpt_answer = chatgpt_question_answering(context, question) return f"""