# Spaces: bertugmirasyedi / aristotle-api (status: Sleeping)
# File size: 12,639 Bytes

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

# Application instance; the interactive API documentation is served from "/".
app = FastAPI(docs_url="/")

# Register CORS handling with a fully permissive policy: every origin, HTTP
# method and request header is accepted, and credentials are allowed.
# NOTE(review): wildcard origins together with credentials is very broad --
# confirm this is intended before the API handles anything sensitive.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_credentials=True,
    allow_origins=["*"],
)


# Cover image used whenever a source does not provide one.
_PLACEHOLDER_COVER = (
    "https://bookstoreromanceday.org/wp-content/uploads/2020/08/"
    "book-cover-placeholder.png"
)

# Marker stored in place of any field a source does not provide.
_MISSING_FIELD = "Null"

# Seconds to wait for the Google Books API before giving up.
_HTTP_TIMEOUT_SECONDS = 30

# Layout of one ChatGPT recommendation line. It is matched case-insensitively
# because the prompt asks for lowercase keys while the model frequently
# capitalises them. The lazy groups stop at the next ", <key>:" marker, so
# commas inside a title or author name are preserved.
_CHATGPT_LINE_PATTERN = (
    r"title:\s*(?P<title>.+?),\s*author:\s*(?P<author>.+?),"
    r"\s*publisher:\s*(?P<publisher>.+?),\s*summary:\s*(?P<summary>.+)"
)


def _dig(mapping, *keys):
    """Follow *keys* through nested dicts; return None if any level is missing."""
    current = mapping
    for key in keys:
        if not isinstance(current, dict):
            return None
        current = current.get(key)
    return current


def _book_record(title, author, publisher, description, image=None):
    """Build the record shape shared by every search backend.

    Any empty or missing text field is replaced by the ``"Null"`` marker and a
    missing image by the placeholder cover, so downstream code never sees
    ``None``.
    """
    return {
        "title": title or _MISSING_FIELD,
        "author": author or _MISSING_FIELD,
        "publisher": publisher or _MISSING_FIELD,
        "description": description or _MISSING_FIELD,
        "image": image or _PLACEHOLDER_COVER,
    }


def _gbooks_search(query, n_results=30):
    """Query the Google Books API and return a list of book records.

    Args:
        query: Search terms; converted to ``str`` before being sent.
        n_results: Maximum number of volumes to request.

    Returns:
        A new list of records (see ``_book_record``); empty when nothing matches.

    Raises:
        requests.RequestException: On network failure, timeout or HTTP error.
    """
    import requests

    response = requests.get(
        "https://www.googleapis.com/books/v1/volumes",
        params={"q": str(query), "printType": "books", "maxResults": n_results},
        timeout=_HTTP_TIMEOUT_SECONDS,
    )
    # Surface API errors instead of mistaking an error payload for "no hits".
    response.raise_for_status()

    records = []
    # The "items" key is absent when the search has no matches.
    for item in response.json().get("items", []):
        info = item.get("volumeInfo", {})

        title = info.get("title")
        if title and info.get("subtitle"):
            title = f"{title}: {info['subtitle']}"

        authors = info.get("authors") or []

        records.append(
            _book_record(
                title=title,
                author=authors[0] if authors else None,
                publisher=info.get("publisher"),
                description=info.get("description"),
                image=_dig(info, "imageLinks", "thumbnail"),
            )
        )

    return records


def _openalex_search(query, n_results=10):
    """Search OpenAlex works and return a list of book records.

    The contact email for OpenAlex is read from the ``OPENALEX_EMAIL``
    environment variable and is left unset when the variable is missing.

    Args:
        query: Search terms; converted to ``str`` before being sent.
        n_results: Maximum number of works to fetch.

    Returns:
        A new list of records (see ``_book_record``); empty when nothing matches.
    """
    import os

    import pyalex
    from pyalex import Works

    contact_email = os.environ.get("OPENALEX_EMAIL")
    if contact_email:
        pyalex.config.email = contact_email

    pager = Works().search(str(query)).paginate(per_page=n_results, n_max=n_results)

    # per_page == n_max, so the first page already holds every requested work.
    first_page = next(iter(pager), [])

    records = []
    for work in first_page:
        # Subscript access (not .get) so pyalex can derive the abstract text.
        try:
            abstract = work["abstract"]
        except KeyError:
            abstract = None

        # Prefer the legacy host_venue field and fall back to the publisher
        # name nested under primary_location.source when it is absent.
        publisher = _dig(work, "host_venue", "publisher") or _dig(
            work, "primary_location", "source", "host_organization_name"
        )

        authorships = work.get("authorships") or []
        author = (
            _dig(authorships[0], "author", "display_name") if authorships else None
        )

        records.append(
            _book_record(
                title=work.get("title"),
                author=author,
                publisher=publisher,
                description=abstract,
            )
        )

    return records


def _openai_search(query, n_results=10):
    """Ask ChatGPT for book recommendations and return them as book records.

    The API key is read from the ``OPENAI_API_KEY`` environment variable; it
    must never be committed to the source file.

    Args:
        query: Topic to ask recommendations for.
        n_results: Number of books requested in the prompt.

    Returns:
        A new list of records (see ``_book_record``), one per response line
        that follows the requested "title/author/publisher/summary" layout.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is not set.
    """
    import os
    import re

    import openai

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("The OPENAI_API_KEY environment variable is not set.")
    openai.api_key = api_key

    chatgpt_response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": "You are a librarian. You are helping a patron find a book.",
            },
            {
                "role": "user",
                "content": f"Recommend me {n_results} books about {query}. Your response should be like: 'title: <title>, author: <author>, publisher: <publisher>, summary: <summary>'",
            },
        ],
    )
    content = chatgpt_response["choices"][0]["message"]["content"]

    records = []
    # Inspect every line: introductions, blank lines and truncated entries
    # simply fail to match and are skipped.
    for line in content.splitlines():
        match = re.search(_CHATGPT_LINE_PATTERN, line, flags=re.IGNORECASE)
        if match is None:
            continue
        records.append(
            _book_record(
                title=match["title"].strip(),
                author=match["author"].strip(),
                publisher=match["publisher"].strip(),
                description=match["summary"].strip(),
            )
        )

    return records


def _predict(titles, descriptions, publishers, similarity="false"):
    """Summarise, classify and optionally cross-compare the collected books.

    Args:
        titles: Book titles.
        descriptions: Book descriptions, parallel to ``titles``.
        publishers: Book publishers, parallel to ``titles``.
        similarity: Any value other than the string ``"false"`` enables the
            pairwise similarity computation.

    Returns:
        A ``(summaries, classes, similar_books)`` tuple of lists parallel to
        ``titles``: summary strings, zero-shot classifier outputs, and lists
        of ``semantic_search`` hits (empty lists when similarity is disabled).
    """
    from transformers import (
        AutoModelForSeq2SeqLM,
        AutoModelForSequenceClassification,
        AutoTokenizer,
        pipeline,
    )

    # One descriptive sentence group per book, used for classification and
    # for the similarity embeddings.
    combined_data = [
        f"The book's title is {title}. It is published by {publisher}. This book is about {description}"
        for title, description, publisher in zip(titles, descriptions, publishers)
    ]

    # NOTE(review): the models are loaded again on every call; consider
    # caching them if request latency matters.
    summarizer_pipeline = pipeline(
        "summarization",
        model=AutoModelForSeq2SeqLM.from_pretrained("pszemraj/led-base-book-summary"),
        tokenizer=AutoTokenizer.from_pretrained("pszemraj/led-base-book-summary"),
        batch_size=64,
    )

    zs_classifier = pipeline(
        "zero-shot-classification",
        model=AutoModelForSequenceClassification.from_pretrained(
            "sileod/deberta-v3-base-tasksource-nli"
        ),
        tokenizer=AutoTokenizer.from_pretrained(
            "sileod/deberta-v3-base-tasksource-nli"
        ),
        batch_size=64,
        hypothesis_template="This book is {}.",
        multi_label=True,
    )

    # Summarise only real descriptions; a missing one keeps the "Null" marker
    # instead of being fed to the model as if it were text.
    summaries = [
        _MISSING_FIELD
        if description == _MISSING_FIELD
        else summarizer_pipeline(description[0:1024])[0]["summary_text"]
        for description in descriptions
    ]

    candidate_labels = [
        "Introductory",
        "Advanced",
        "Academic",
        "Not Academic",
        "Manual",
    ]
    classes = [zs_classifier(doc, candidate_labels) for doc in combined_data]

    if similarity != "false":
        from sentence_transformers import SentenceTransformer, util

        sentence_transformer = SentenceTransformer("all-MiniLM-L6-v2")
        book_embeddings = sentence_transformer.encode(
            combined_data, convert_to_tensor=True
        )

        # A single batched call yields one ranked hit list per book.
        hits_per_book = util.semantic_search(
            book_embeddings, book_embeddings, top_k=20
        )

        # Drop each book's own entry by id rather than by position, then keep
        # at most 19 neighbours.
        similar_books = [
            [hit for hit in hits if hit["corpus_id"] != index][:19]
            for index, hits in enumerate(hits_per_book)
        ]
    else:
        similar_books = [[] for _ in titles]

    return summaries, classes, similar_books


@app.get("/search={query}&similarity={similarity}")
def search(query, similarity="false"):
    """Search several book sources and enrich every hit with model output.

    Results from Google Books, OpenAlex and ChatGPT are gathered in that
    order; each book is then summarised, classified and, when requested,
    compared with all the others.

    Args:
        query: Search terms taken from the request path.
        similarity: The string ``"false"`` disables the similarity ranking;
            any other value enables it.

    Returns:
        A list with one dict per book holding the keys ``id``, ``title``,
        ``author``, ``publisher``, ``image_link``, ``labels``,
        ``label_confidences``, ``summary``, ``similar_books``, ``checkpoints``
        (whole seconds spent in each of the four stages) and ``runtime``.
        The list is empty when no source returns a result.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is not set.
        requests.RequestException: If the Google Books request fails.
    """
    import time

    start_time = time.time()

    # Every backend returns its own fresh list, so each book appears once.
    records = _gbooks_search(query)
    first_checkpoint = time.time()

    records += _openalex_search(query)
    second_checkpoint = time.time()

    records += _openai_search(query)
    third_checkpoint = time.time()

    # Nothing to enrich: skip the expensive model loading altogether.
    if not records:
        return []

    summaries, classes, similar_books = _predict(
        [record["title"] for record in records],
        [record["description"] for record in records],
        [record["publisher"] for record in records],
        similarity=similarity,
    )
    fourth_checkpoint = time.time()

    checkpoints = [
        int(first_checkpoint - start_time),
        int(second_checkpoint - first_checkpoint),
        int(third_checkpoint - second_checkpoint),
        int(fourth_checkpoint - third_checkpoint),
    ]
    runtime = f"{time.time() - start_time:.2f} seconds"

    return [
        {
            "id": index,
            "title": record["title"],
            "author": record["author"],
            "publisher": record["publisher"],
            "image_link": record["image"],
            "labels": prediction["labels"][0:2],
            "label_confidences": prediction["scores"][0:2],
            "summary": summary,
            "similar_books": neighbours,
            "checkpoints": checkpoints,
            "runtime": runtime,
        }
        for index, (record, prediction, summary, neighbours) in enumerate(
            zip(records, classes, summaries, similar_books)
        )
    ]