Spaces:
Sleeping
Sleeping
File size: 5,070 Bytes
74c716c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
import json
import os
import sys
from typing import Optional
import requests
import tiktoken
class Summarizer:
    """Create concise summaries of online articles via the OpenAI chat API.

    Keyword Arguments
    -----------------
    model : str, optional
        Name of the OpenAI chat model to use. ``"gpt-3.5-turbo"`` by default.
    max_tokens : int, optional
        Maximum number of tokens (prompt + content) the model accepts.
        ``4096`` by default.
    result_tokens : int, optional
        Number of tokens reserved for the model's answer. ``300`` by default.
    """

    def __init__(self, **kwargs):
        # Endpoint of the OpenAI chat-completions API.
        self.openai_endpoint = "https://api.openai.com/v1/chat/completions"
        # Prompt template (base version, without a search string).
        self.prompt_template = self._get_prompt_template()
        # Type of model to use.
        self.model = kwargs.get("model", "gpt-3.5-turbo")
        # Model hyperparameters.
        self.max_tokens = kwargs.get("max_tokens", 4096)
        self.result_tokens = kwargs.get("result_tokens", 300)
        # Tokenizer matching the chosen model.
        self.model_encoding = self._get_model_encoding()
        # Token length of the base prompt template (cached so ``summarize``
        # does not re-encode it on every call).
        self.prompt_token_length = self._get_number_of_tokens(
            self.prompt_template
        )

    def _get_prompt_template(self, search_string=None) -> str:
        """Return the system-prompt template.

        Parameters
        -----------
        search_string : str, optional
            When truthy, an extra guideline is appended instructing the model
            to emphasize information related to this string.

        Returns
        ---------
        template_text : str
            The prompt template to use as the system message.
        """
        # Defining the template to use
        template_text = """
Create a concise, clear, and in-depth summary of the following online
article. Adhere to the following guidelines:
1. Sound professional, detached and avoid emotionally charged language.
2. Make sure to describe who is discussed in the article, what are
the events or concepts, when things happened, and, if this information is
available, why.
3. The summary should be between one and three paragraphs.
"""
        if search_string:
            template_text += f"""
4. Make sure to include and emphasize any information in the article that
relates to the following search string:
"{search_string}"
"""
        return template_text

    def _get_model_encoding(self):
        """Return the ``tiktoken`` encoding for the configured model."""
        return tiktoken.encoding_for_model(self.model)

    def _get_number_of_tokens(self, input_text: str) -> int:
        """
        Method for determining the number of tokens of the input text.

        Parameters
        -----------
        input_text : str
            Text to use for calculating its token length.

        Returns
        ---------
        text_token_length : int
            Length of the tokens of the input text.
        """
        return len(self.model_encoding.encode(input_text))

    def _run_model(
        self,
        user_content: str,
        search_string: Optional[str] = None,
        temperature: float = 1,
    ) -> str:
        """
        Method for running the model that will create the summary for a given
        observation.

        Parameters
        ------------
        user_content : str
            Content by the user that will be sent to the model via its API.
        search_string : str, optional
            Search string forwarded to :meth:`_get_prompt_template` to bias
            the summary. ``None`` by default.
        temperature : float, optional
            Amount of ``temperature`` to give to the model. This parameter
            handles the amount of creativity that the model can have when
            creating the output response. This variable is set to ``1`` by
            default.

        Returns
        ----------
        summary : str
            The model's response content.

        Raises
        -------
        RuntimeError
            If the API responds with a non-200 status code.
        """
        # Creating the headers
        headers = {
            "Content-Type": "application/json",
            "Authorization": f'Bearer {os.environ["OPENAI_API_KEY"]}',
        }
        # Composing the input messages
        messages = [
            {
                "role": "system",
                "content": self._get_prompt_template(search_string),
            },
            {"role": "user", "content": user_content},
        ]
        # Parsing the request data
        request_data = {
            "model": self.model,
            "messages": messages,
            "temperature": temperature,
        }
        # Extracting the response from the model's API
        response = requests.post(
            self.openai_endpoint,
            headers=headers,
            data=json.dumps(request_data),
            timeout=60,
        )
        # Checking if the response was OK
        if response.status_code == 200:
            return response.json()["choices"][0]["message"]["content"]
        else:
            raise RuntimeError(
                f"HTTP request failed {response.status_code}, {response.text}"
            )

    def summarize(self, title, content, search_string=None, max_retries=3):
        """Summarize an article, shrinking it to fit the model's token budget.

        Parameters
        -----------
        title : str
            Title of the article; prepended to the content.
        content : str
            Body of the article to summarize.
        search_string : str, optional
            When given, the prompt instructs the model to emphasize related
            information. ``None`` by default.
        max_retries : int, optional
            Maximum number of API attempts before giving up. ``3`` by
            default (the original retried forever on persistent errors).

        Returns
        ---------
        summary : str
            The model-generated summary.

        Raises
        -------
        RuntimeError
            If the model call keeps failing after ``max_retries`` attempts.
        """
        content_for_summary = f"{title}\n\n{content}"
        # BUG FIX: the original condition was inverted — it used the cached
        # base-template length precisely when a search string made the prompt
        # LONGER, underestimating the budget. Recompute when a search string
        # is present; otherwise the cached base length is exact.
        prompt_token_length = (
            self._get_number_of_tokens(
                self._get_prompt_template(search_string)
            )
            if search_string
            else self.prompt_token_length
        )
        data_token_length = self._get_number_of_tokens(content_for_summary)
        # Repeatedly drop every 10th word until the request fits (10-token
        # safety margin).
        while data_token_length + prompt_token_length > self.max_tokens - 10:
            print("Decimating the content.")
            words = content.split()
            if not words:
                # Content is already empty (only the title remains); further
                # decimation makes no progress — avoid an infinite loop.
                break
            del words[::10]
            content = " ".join(words)
            content_for_summary = f"{title}\n\n{content}"
            data_token_length = self._get_number_of_tokens(content_for_summary)
        # Bounded retry loop (the original spun forever on permanent errors,
        # e.g. a bad API key).
        last_error = None
        for _ in range(max_retries):
            try:
                return self._run_model(
                    user_content=content_for_summary,
                    search_string=search_string,
                )
            except Exception as e:
                print(e, file=sys.stderr)
                last_error = e
        raise RuntimeError(
            f"Summarization failed after {max_retries} attempts."
        ) from last_error
|