Spaces:
Sleeping
Sleeping
File size: 5,070 Bytes
74c716c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
import json
import os
import sys
from typing import Optional
import requests
import tiktoken
class Summarizer:
    """Create concise summaries of online articles via the OpenAI chat API.

    Keyword Arguments
    -----------------
    model : str, optional
        Name of the OpenAI chat model to use. ``"gpt-3.5-turbo"`` by default.
    max_tokens : int, optional
        Maximum number of tokens (prompt + content) the model accepts.
        ``4096`` by default.
    result_tokens : int, optional
        Number of tokens reserved for the model's answer. ``300`` by default.
    """

    def __init__(self, **kwargs):
        # Endpoint of the OpenAI chat-completions API.
        self.openai_endpoint = "https://api.openai.com/v1/chat/completions"
        # Prompt template (base version, without a search string).
        self.prompt_template = self._get_prompt_template()
        # Type of model to use.
        self.model = kwargs.get("model", "gpt-3.5-turbo")
        # Model hyperparameters.
        self.max_tokens = kwargs.get("max_tokens", 4096)
        self.result_tokens = kwargs.get("result_tokens", 300)
        # Tokenizer matching the chosen model.
        self.model_encoding = self._get_model_encoding()
        # Token length of the base prompt template (cached so ``summarize``
        # does not re-encode it on every call).
        self.prompt_token_length = self._get_number_of_tokens(
            self.prompt_template
        )

    def _get_prompt_template(self, search_string=None) -> str:
        """Return the system-prompt template.

        Parameters
        -----------
        search_string : str, optional
            When truthy, an extra guideline is appended instructing the model
            to emphasize information related to this string.

        Returns
        ---------
        template_text : str
            The prompt template to use as the system message.
        """
        # Defining the template to use
        template_text = """
Create a concise, clear, and in-depth summary of the following online
article. Adhere to the following guidelines:
1. Sound professional, detached and avoid emotionally charged language.
2. Make sure to describe who is discussed in the article, what are
the events or concepts, when things happened, and, if this information is
available, why.
3. The summary should be between one and three paragraphs.
"""
        if search_string:
            template_text += f"""
4. Make sure to include and emphasize any information in the article that
relates to the following search string:
"{search_string}"
"""
        return template_text

    def _get_model_encoding(self):
        """Return the ``tiktoken`` encoding for the configured model."""
        return tiktoken.encoding_for_model(self.model)

    def _get_number_of_tokens(self, input_text: str) -> int:
        """
        Method for determining the number of tokens of the input text.

        Parameters
        -----------
        input_text : str
            Text to use for calculating its token length.

        Returns
        ---------
        text_token_length : int
            Length of the tokens of the input text.
        """
        return len(self.model_encoding.encode(input_text))

    def _run_model(
        self,
        user_content: str,
        search_string: Optional[str] = None,
        temperature: float = 1,
    ) -> str:
        """
        Method for running the model that will create the summary for a given
        observation.

        Parameters
        ------------
        user_content : str
            Content by the user that will be sent to the model via its API.
        search_string : str, optional
            Search string forwarded to :meth:`_get_prompt_template` to bias
            the summary. ``None`` by default.
        temperature : float, optional
            Amount of ``temperature`` to give to the model. This parameter
            handles the amount of creativity that the model can have when
            creating the output response. This variable is set to ``1`` by
            default.

        Returns
        ----------
        summary : str
            The model's response content.

        Raises
        -------
        RuntimeError
            If the API responds with a non-200 status code.
        """
        # Creating the headers
        headers = {
            "Content-Type": "application/json",
            "Authorization": f'Bearer {os.environ["OPENAI_API_KEY"]}',
        }
        # Composing the input messages
        messages = [
            {
                "role": "system",
                "content": self._get_prompt_template(search_string),
            },
            {"role": "user", "content": user_content},
        ]
        # Parsing the request data
        request_data = {
            "model": self.model,
            "messages": messages,
            "temperature": temperature,
        }
        # Extracting the response from the model's API
        response = requests.post(
            self.openai_endpoint,
            headers=headers,
            data=json.dumps(request_data),
            timeout=60,
        )
        # Checking if the response was OK
        if response.status_code == 200:
            return response.json()["choices"][0]["message"]["content"]
        else:
            raise RuntimeError(
                f"HTTP request failed {response.status_code}, {response.text}"
            )

    def summarize(self, title, content, search_string=None, max_retries=3):
        """Summarize an article, shrinking it to fit the model's token budget.

        Parameters
        -----------
        title : str
            Title of the article; prepended to the content.
        content : str
            Body of the article to summarize.
        search_string : str, optional
            When given, the prompt instructs the model to emphasize related
            information. ``None`` by default.
        max_retries : int, optional
            Maximum number of API attempts before giving up. ``3`` by
            default (the original retried forever on persistent errors).

        Returns
        ---------
        summary : str
            The model-generated summary.

        Raises
        -------
        RuntimeError
            If the model call keeps failing after ``max_retries`` attempts.
        """
        content_for_summary = f"{title}\n\n{content}"
        # BUG FIX: the original condition was inverted — it used the cached
        # base-template length precisely when a search string made the prompt
        # LONGER, underestimating the budget. Recompute when a search string
        # is present; otherwise the cached base length is exact.
        prompt_token_length = (
            self._get_number_of_tokens(
                self._get_prompt_template(search_string)
            )
            if search_string
            else self.prompt_token_length
        )
        data_token_length = self._get_number_of_tokens(content_for_summary)
        # Repeatedly drop every 10th word until the request fits (10-token
        # safety margin).
        while data_token_length + prompt_token_length > self.max_tokens - 10:
            print("Decimating the content.")
            words = content.split()
            if not words:
                # Content is already empty (only the title remains); further
                # decimation makes no progress — avoid an infinite loop.
                break
            del words[::10]
            content = " ".join(words)
            content_for_summary = f"{title}\n\n{content}"
            data_token_length = self._get_number_of_tokens(content_for_summary)
        # Bounded retry loop (the original spun forever on permanent errors,
        # e.g. a bad API key).
        last_error = None
        for _ in range(max_retries):
            try:
                return self._run_model(
                    user_content=content_for_summary,
                    search_string=search_string,
                )
            except Exception as e:
                print(e, file=sys.stderr)
                last_error = e
        raise RuntimeError(
            f"Summarization failed after {max_retries} attempts."
        ) from last_error
|