Krishnan Palanisami committed on
Commit 313428e · verified · 1 Parent(s): 4a2ef03

Delete app.py

Files changed (1)
  1. app.py +0 -575
app.py DELETED
@@ -1,575 +0,0 @@
-import streamlit as st
-import wikipedia
-from haystack.document_stores import InMemoryDocumentStore
-from haystack.utils import clean_wiki_text, convert_files_to_docs
-from haystack.nodes import TfidfRetriever, FARMReader
-from haystack.pipelines import ExtractiveQAPipeline
-from main import print_qa, QuestionGenerator
-import en_core_web_sm
-import json
-import numpy as np
-import random
-import re
-import torch
-from transformers import (
-    AutoTokenizer,
-    AutoModelForSeq2SeqLM,
-    AutoModelForSequenceClassification,
-)
-from typing import Any, List, Mapping, Tuple
-
-
-class QuestionGenerator:
-    """A transformer-based NLP system for generating reading comprehension-style questions from
-    texts. It can generate full-sentence questions, multiple-choice questions, or a mix of the
-    two styles.
-
-    To filter out low-quality questions, generated questions are assigned a score and
-    ranked. Only the top k questions will be returned. This behaviour can be turned off
-    by setting use_evaluator=False.
-    """
-
-    def __init__(self) -> None:
-        QG_PRETRAINED = "iarfmoose/t5-base-question-generator"
-        self.ANSWER_TOKEN = "<answer>"
-        self.CONTEXT_TOKEN = "<context>"
-        self.SEQ_LENGTH = 512
-
-        self.device = torch.device(
-            "cuda" if torch.cuda.is_available() else "cpu")
-
-        self.qg_tokenizer = AutoTokenizer.from_pretrained(
-            QG_PRETRAINED, use_fast=False)
-        self.qg_model = AutoModelForSeq2SeqLM.from_pretrained(QG_PRETRAINED)
-        self.qg_model.to(self.device)
-        self.qg_model.eval()
-
-        self.qa_evaluator = QAEvaluator()
-
-    def generate(
-        self,
-        article: str,
-        use_evaluator: bool = True,
-        num_questions: int = None,
-        answer_style: str = "all"
-    ) -> List:
-        """Takes an article and generates a set of question-and-answer pairs. If use_evaluator
-        is True, the QA pairs are ranked and filtered based on their quality. answer_style
-        should be selected from ["all", "sentences", "multiple_choice"].
-        """
-        print("Generating questions...\n")
-
-        qg_inputs, qg_answers = self.generate_qg_inputs(article, answer_style)
-        generated_questions = self.generate_questions_from_inputs(qg_inputs)
-
-        message = "{} questions don't match {} answers".format(
-            len(generated_questions), len(qg_answers)
-        )
-        assert len(generated_questions) == len(qg_answers), message
-
-        if use_evaluator:
-            print("Evaluating QA pairs...\n")
-            encoded_qa_pairs = self.qa_evaluator.encode_qa_pairs(
-                generated_questions, qg_answers
-            )
-            scores = self.qa_evaluator.get_scores(encoded_qa_pairs)
-
-            if num_questions:
-                qa_list = self._get_ranked_qa_pairs(
-                    generated_questions, qg_answers, scores, num_questions
-                )
-            else:
-                qa_list = self._get_ranked_qa_pairs(
-                    generated_questions, qg_answers, scores
-                )
-        else:
-            print("Skipping evaluation step.\n")
-            qa_list = self._get_all_qa_pairs(generated_questions, qg_answers)
-
-        return qa_list
-
-    def generate_qg_inputs(self, text: str, answer_style: str) -> Tuple[List[str], List[str]]:
-        """Given a text, returns a list of model inputs and a list of corresponding answers.
-        Model inputs take the form "answer_token <answer text> context_token <context text>",
-        where the answer is a string extracted from the text and the context is the wider text
-        surrounding the answer.
-        """
-        VALID_ANSWER_STYLES = ["all", "sentences", "multiple_choice"]
-
-        if answer_style not in VALID_ANSWER_STYLES:
-            raise ValueError(
-                "Invalid answer style {}. Please choose from {}".format(
-                    answer_style, VALID_ANSWER_STYLES
-                )
-            )
-
-        inputs = []
-        answers = []
-
-        if answer_style == "sentences" or answer_style == "all":
-            segments = self._split_into_segments(text)
-
-            for segment in segments:
-                sentences = self._split_text(segment)
-                prepped_inputs, prepped_answers = self._prepare_qg_inputs(
-                    sentences, segment
-                )
-                inputs.extend(prepped_inputs)
-                answers.extend(prepped_answers)
-
-        if answer_style == "multiple_choice" or answer_style == "all":
-            sentences = self._split_text(text)
-            prepped_inputs, prepped_answers = self._prepare_qg_inputs_MC(
-                sentences
-            )
-            inputs.extend(prepped_inputs)
-            answers.extend(prepped_answers)
-
-        return inputs, answers
-
-    def generate_questions_from_inputs(self, qg_inputs: List) -> List[str]:
-        """Given a list of concatenated answers and contexts, with the form
-        "answer_token <answer text> context_token <context text>", generates a list of
-        questions.
-        """
-        generated_questions = []
-
-        for qg_input in qg_inputs:
-            question = self._generate_question(qg_input)
-            generated_questions.append(question)
-
-        return generated_questions
-
-    def _split_text(self, text: str) -> List[str]:
-        """Splits the text into sentences, and attempts to split or truncate long sentences."""
-        MAX_SENTENCE_LEN = 128
-        sentences = re.findall(".*?[.!?]", text)
-        cut_sentences = []
-
-        for sentence in sentences:
-            if len(sentence) > MAX_SENTENCE_LEN:
-                cut_sentences.extend(re.split("[,;:)]", sentence))
-
-        # remove useless post-quote sentence fragments
-        cut_sentences = [s for s in cut_sentences if len(s.split(" ")) > 5]
-        sentences = sentences + cut_sentences
-
-        return list(set([s.strip(" ") for s in sentences]))
-
-    def _split_into_segments(self, text: str) -> List[str]:
-        """Splits a long text into segments short enough to be input into the transformer network.
-        Segments are used as context for question generation.
-        """
-        MAX_TOKENS = 490
-        paragraphs = text.split("\n")
-        tokenized_paragraphs = [
-            self.qg_tokenizer(p)["input_ids"] for p in paragraphs if len(p) > 0
-        ]
-        segments = []
-
-        while len(tokenized_paragraphs) > 0:
-            segment = []
-
-            # greedily pack whole paragraphs into the segment until the token budget is hit
-            while len(segment) < MAX_TOKENS and len(tokenized_paragraphs) > 0:
-                paragraph = tokenized_paragraphs.pop(0)
-                segment.extend(paragraph)
-            segments.append(segment)
-
-        return [self.qg_tokenizer.decode(s, skip_special_tokens=True) for s in segments]
-
-    def _prepare_qg_inputs(
-        self,
-        sentences: List[str],
-        text: str
-    ) -> Tuple[List[str], List[str]]:
-        """Uses sentences as answers and the text as context. Returns a tuple of (model inputs, answers).
-        Model inputs are "answer_token <answer text> context_token <context text>".
-        """
-        inputs = []
-        answers = []
-
-        for sentence in sentences:
-            qg_input = f"{self.ANSWER_TOKEN} {sentence} {self.CONTEXT_TOKEN} {text}"
-            inputs.append(qg_input)
-            answers.append(sentence)
-
-        return inputs, answers
-
-    def _prepare_qg_inputs_MC(self, sentences: List[str]) -> Tuple[List[str], List[str]]:
-        """Performs NER on the text, and uses extracted entities as candidate answers for multiple-choice
-        questions. Sentences are used as context, and entities as answers. Returns a tuple of (model inputs, answers).
-        Model inputs are "answer_token <answer text> context_token <context text>".
-        """
-        spacy_nlp = en_core_web_sm.load()
-        docs = list(spacy_nlp.pipe(sentences, disable=["parser"]))
-        inputs_from_text = []
-        answers_from_text = []
-
-        for doc, sentence in zip(docs, sentences):
-            entities = doc.ents
-            if entities:
-                for entity in entities:
-                    qg_input = f"{self.ANSWER_TOKEN} {entity} {self.CONTEXT_TOKEN} {sentence}"
-                    answers = self._get_MC_answers(entity, docs)
-                    inputs_from_text.append(qg_input)
-                    answers_from_text.append(answers)
-
-        return inputs_from_text, answers_from_text
-
-    def _get_MC_answers(self, correct_answer: Any, docs: Any) -> List[Mapping[str, Any]]:
-        """Finds a set of alternative answers for a multiple-choice question. Will attempt to find
-        alternatives of the same entity type as correct_answer if possible.
-        """
-        entities = []
-
-        for doc in docs:
-            entities.extend([{"text": e.text, "label_": e.label_} for e in doc.ents])
-
-        # serialise the entities so duplicates can be removed, then sort for determinism
-        entities_json = [json.dumps(kv) for kv in entities]
-        pool = sorted(set(entities_json))
-        num_choices = min(4, len(pool)) - 1  # -1 because we already have the correct answer
-
-        # add the correct answer
-        final_choices = []
-        correct_label = correct_answer.label_
-        final_choices.append({"answer": correct_answer.text, "correct": True})
-
-        # remove the correct answer from the pool
-        pool = [e for e in pool if e != json.dumps({"text": correct_answer.text, "label_": correct_answer.label_})]
-
-        # find answers with the same NER label
-        matches = [e for e in pool if correct_label in e]
-
-        # if there aren't enough matches, fill in with other random answers,
-        # excluding the matches already chosen so no option appears twice
-        if len(matches) < num_choices:
-            choices = matches
-            remaining_pool = [e for e in pool if e not in choices]
-            choices.extend(random.sample(remaining_pool, num_choices - len(choices)))
-        else:
-            choices = random.sample(sorted(matches), num_choices)
-
-        choices = [json.loads(s) for s in choices]
-
-        for choice in choices:
-            final_choices.append({"answer": choice["text"], "correct": False})
-
-        random.shuffle(final_choices)
-        return final_choices
-
-    @torch.no_grad()
-    def _generate_question(self, qg_input: str) -> str:
-        """Takes qg_input, the concatenated answer and context, and uses it to generate
-        a question sentence. The generated question is decoded and then returned.
-        """
-        encoded_input = self._encode_qg_input(qg_input)
-        output = self.qg_model.generate(input_ids=encoded_input["input_ids"])
-        question = self.qg_tokenizer.decode(
-            output[0],
-            skip_special_tokens=True
-        )
-        return question
-
-    def _encode_qg_input(self, qg_input: str) -> torch.tensor:
-        """Tokenizes a string and returns a tensor of input ids corresponding to indices of tokens in
-        the vocab.
-        """
-        return self.qg_tokenizer(
-            qg_input,
-            padding='max_length',
-            max_length=self.SEQ_LENGTH,
-            truncation=True,
-            return_tensors="pt",
-        ).to(self.device)
-
-    def _get_ranked_qa_pairs(
-        self, generated_questions: List[str], qg_answers: List[str], scores, num_questions: int = 10
-    ) -> List[Mapping[str, str]]:
-        """Ranks generated questions according to scores, and returns the top num_questions examples.
-        """
-        if num_questions > len(scores):
-            num_questions = len(scores)
-            print(
-                f"\nWas only able to generate {num_questions} questions. "
-                "For more questions, please input a longer text."
-            )
-
-        qa_list = []
-
-        for i in range(num_questions):
-            index = scores[i]
-            qa = {
-                "question": generated_questions[index].split("?")[0] + "?",
-                "answer": qg_answers[index]
-            }
-            qa_list.append(qa)
-
-        return qa_list
-
-    def _get_all_qa_pairs(self, generated_questions: List[str], qg_answers: List[str]):
-        """Formats question and answer pairs without ranking or filtering."""
-        qa_list = []
-
-        for question, answer in zip(generated_questions, qg_answers):
-            qa = {
-                "question": question.split("?")[0] + "?",
-                "answer": answer
-            }
-            qa_list.append(qa)
-
-        return qa_list
-
-
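A minimal usage sketch of the class above, assuming this module's classes are importable and the iarfmoose/t5-base-question-generator weights download on first use (the sample text and parameter values are illustrative, not from the deleted file):

```python
# Illustrative driver for QuestionGenerator (hypothetical, for reference only).
qg = QuestionGenerator()

text = (
    "Alan Turing was a British mathematician and computer scientist. "
    "He is widely considered to be the father of theoretical computer science."
)

# generate() returns a list of {"question": ..., "answer": ...} mappings;
# with answer_style="multiple_choice", "answer" is a list of option dicts.
qa_list = qg.generate(text, num_questions=3, answer_style="multiple_choice")
```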
-class QAEvaluator:
-    """Wrapper for a transformer model which evaluates the quality of question-answer pairs.
-    Given a QA pair, the model will generate a score. Scores can be used to rank and filter
-    QA pairs.
-    """
-
-    def __init__(self) -> None:
-        QAE_PRETRAINED = "iarfmoose/bert-base-cased-qa-evaluator"
-        self.SEQ_LENGTH = 512
-
-        self.device = torch.device(
-            "cuda" if torch.cuda.is_available() else "cpu")
-
-        self.qae_tokenizer = AutoTokenizer.from_pretrained(QAE_PRETRAINED)
-        self.qae_model = AutoModelForSequenceClassification.from_pretrained(
-            QAE_PRETRAINED
-        )
-        self.qae_model.to(self.device)
-        self.qae_model.eval()
-
-    def encode_qa_pairs(self, questions: List[str], answers: List[str]) -> List[torch.tensor]:
-        """Takes a list of questions and a list of answers and encodes them as a list of tensors."""
-        encoded_pairs = []
-
-        for question, answer in zip(questions, answers):
-            encoded_qa = self._encode_qa(question, answer)
-            encoded_pairs.append(encoded_qa.to(self.device))
-
-        return encoded_pairs
-
-    def get_scores(self, encoded_qa_pairs: List[torch.tensor]) -> List[int]:
-        """Scores a list of encoded QA pairs and returns their indices, sorted from
-        highest- to lowest-scoring.
-        """
-        scores = {}
-
-        for i in range(len(encoded_qa_pairs)):
-            scores[i] = self._evaluate_qa(encoded_qa_pairs[i])
-
-        return [
-            k for k, v in sorted(scores.items(), key=lambda item: item[1], reverse=True)
-        ]
-
-    def _encode_qa(self, question: str, answer: str) -> torch.tensor:
-        """Concatenates a question and answer, and then tokenizes them. Returns a tensor of
-        input ids corresponding to indices in the vocab.
-        """
-        if type(answer) is list:
-            for a in answer:
-                if a["correct"]:
-                    correct_answer = a["answer"]
-        else:
-            correct_answer = answer
-
-        return self.qae_tokenizer(
-            text=question,
-            text_pair=correct_answer,
-            padding="max_length",
-            max_length=self.SEQ_LENGTH,
-            truncation=True,
-            return_tensors="pt",
-        )
-
-    @torch.no_grad()
-    def _evaluate_qa(self, encoded_qa_pair: torch.tensor) -> float:
-        """Takes an encoded QA pair and returns a score."""
-        output = self.qae_model(**encoded_qa_pair)
-        return output[0][0][1]
-
-
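A sketch of the QAEvaluator contract, under the same importability assumption: get_scores returns the indices of the input pairs sorted best-first rather than raw scores, which is why _get_ranked_qa_pairs above can look up questions with scores[i]:

```python
evaluator = QAEvaluator()

# Two illustrative QA pairs (invented for the example).
encoded = evaluator.encode_qa_pairs(
    ["Who wrote Hamlet?", "What is the capital of France?"],
    ["William Shakespeare", "Paris"],
)

ranked = evaluator.get_scores(encoded)  # e.g. [1, 0]: indices, best pair first
best_pair_index = ranked[0]
```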
-def print_qa(qa_list: List[Mapping[str, str]], show_answers: bool = True) -> None:
-    """Formats and prints a list of generated questions and answers."""
-
-    for i in range(len(qa_list)):
-        # wider space for 2-digit question numbers
-        space = " " * int(np.where(i < 9, 3, 4))
-
-        print(f"{i + 1}) Q: {qa_list[i]['question']}")
-
-        answer = qa_list[i]["answer"]
-
-        # print a list of multiple choice answers
-        if type(answer) is list:
-
-            if show_answers:
-                print(
-                    f"{space}A: 1. {answer[0]['answer']} "
-                    f"{np.where(answer[0]['correct'], '(correct)', '')}"
-                )
-                for j in range(1, len(answer)):
-                    print(
-                        f"{space + ' '}{j + 1}. {answer[j]['answer']} "
-                        f"{np.where(answer[j]['correct'], '(correct)', '')}"
-                    )
-
-            else:
-                print(f"{space}A: 1. {answer[0]['answer']}")
-                for j in range(1, len(answer)):
-                    print(f"{space + ' '}{j + 1}. {answer[j]['answer']}")
-
-            print("")
-
-        # print full sentence answers
-        else:
-            if show_answers:
-                print(f"{space}A: {answer}\n")
-
-
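For reference, a sketch of the qa_list shape that print_qa consumes (values invented for illustration): a full-sentence answer is a plain string, while a multiple-choice answer is a list of option dicts with exactly one marked correct:

```python
sample_qa_list = [
    # full-sentence style: "answer" is a string
    {"question": "Who was Alan Turing?", "answer": "Alan Turing was a British mathematician."},
    # multiple-choice style: "answer" is a list of options
    {
        "question": "Which country was Alan Turing from?",
        "answer": [
            {"answer": "Britain", "correct": True},
            {"answer": "France", "correct": False},
        ],
    },
]

print_qa(sample_qa_list, show_answers=True)
```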
-def main():
-    # Set the Streamlit app title
-    st.title("Question Generation using Haystack and Streamlit")
-
-    # Select the input type
-    inputs = ["Input Paragraph", "Wikipedia Examples"]
-    input_type = st.selectbox("Select an input type:", inputs)
-
-    # Initialize wiki_text as an empty string
-    wiki_text = ""
-
-    # Handle different input types
-    if input_type == "Input Paragraph":
-        # Allow the user to input a text paragraph
-        wiki_text = st.text_area("Input paragraph:", height=200)
-
-    elif input_type == "Wikipedia Examples":
-        # Define topics for selection
-        topics = ["Deep Learning", "Machine Learning"]
-        selected_topic = st.selectbox("Select a topic:", topics)
-
-        # Retrieve Wikipedia content based on the selected topic
-        if selected_topic:
-            wiki = wikipedia.page(selected_topic)
-            wiki_text = wiki.content
-
-        # Display the retrieved Wikipedia content (optional)
-        st.text_area("Retrieved Wikipedia content:", wiki_text, height=200)
-
-    # Preprocess the input text
-    wiki_text = clean_wiki_text(wiki_text)
-
-    # Allow the user to specify the number of questions to generate
-    num_questions = st.slider("Number of questions to generate:", min_value=1, max_value=20, value=5)
-
-    # Allow the user to specify the reader model
-    model_options = [
-        "deepset/roberta-base-squad2",
-        "deepset/roberta-base-squad2-distilled",
-        "deepset/bert-large-uncased-whole-word-masking-squad2",
-        "deepset/flan-t5-xl-squad2",
-    ]
-    model_name = st.selectbox("Select model:", model_options)
-
-    # Button to generate questions
-    if st.button("Generate Questions"):
-        document_store = InMemoryDocumentStore()
-
-        # Convert the preprocessed text into a document
-        document = {"content": wiki_text}
-        document_store.write_documents([document])
-
-        # Initialize a TfidfRetriever
-        retriever = TfidfRetriever(document_store=document_store)
-
-        # Initialize a FARMReader with the selected model
-        reader = FARMReader(model_name_or_path=model_name, use_gpu=False)
-
-        # Build an extractive QA pipeline (note: built but never used below;
-        # question generation is handled by QuestionGenerator directly)
-        pipe = ExtractiveQAPipeline(reader, retriever)
-
-        # Initialize the QuestionGenerator
-        qg = QuestionGenerator()
-
-        # Generate multiple-choice questions
-        qa_list = qg.generate(
-            wiki_text,
-            num_questions=num_questions,
-            answer_style='multiple_choice'
-        )
-
-        # Display the generated questions and answers
-        st.header("Generated Questions and Answers:")
-        for idx, qa in enumerate(qa_list):
-            # Display the question
-            st.write(f"Question {idx + 1}: {qa['question']}")
-
-            # Display the answer options
-            if 'answer' in qa:
-                for i, option in enumerate(qa['answer']):
-                    correct_marker = "(correct)" if option["correct"] else ""
-                    st.write(f"Option {i + 1}: {option['answer']} {correct_marker}")
-
-            # Add a separator after each question-answer pair
-            st.write("-" * 40)
-
-
-# Run the Streamlit app
-if __name__ == "__main__":
-    main()
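For reference, an app.py like the one deleted here is launched from the repository root with Streamlit's CLI (assuming streamlit and the model dependencies are installed):

```
streamlit run app.py
```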