|
import os |
|
from typing import List |
|
from typing import Union |
|
|
|
import openai |
|
import tiktoken |
|
from dotenv import load_dotenv |
|
from icecream import ic |
|
from langchain_text_splitters import RecursiveCharacterTextSplitter |
|
|
|
|
|
# Load variables from a .env file (if present) so the os.getenv calls below can see them.
load_dotenv()

# Model name used by default for completions: OPENAI_MODEL when it is set to a
# non-empty value, otherwise "gpt-4-turbo".
model = os.getenv("OPENAI_MODEL") or "gpt-4-turbo"

# Single OpenAI client shared by every request made from this module.
client = openai.OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url=os.getenv("OPENAI_BASE_URL"),
)

# Default token threshold used by `translate` to decide between single-chunk
# and multi-chunk translation, and as the per-chunk token limit.
MAX_TOKENS_PER_CHUNK = 1000
|
|
|
|
|
|
|
def get_completion(
    prompt: str,
    system_message: str = "You are a helpful assistant.",
    model: str = model,
    temperature: float = 0.3,
    json_mode: bool = False,
) -> Union[str, dict]:
    """
    Generate a chat completion using the OpenAI API.

    Args:
        prompt (str): The user's prompt or query.
        system_message (str, optional): The system message to set the context for the assistant.
            Defaults to "You are a helpful assistant.".
        model (str, optional): The name of the model to use for generating the completion.
            Defaults to the module-level `model` value.
        temperature (float, optional): The sampling temperature for controlling the randomness
            of the generated text. Defaults to 0.3.
        json_mode (bool, optional): When True, the request is sent with
            `response_format={"type": "json_object"}`. Defaults to False.

    Returns:
        Union[str, dict]: The message content of the first choice in the response.
            This is the content string in both modes; with json_mode=True the
            content is not parsed here, so callers must decode it themselves.
            The `Union[str, dict]` annotation is kept for backward compatibility.
    """
    # Both modes send the same request; json_mode only adds `response_format`.
    request_kwargs = {
        "model": model,
        "temperature": temperature,
        "top_p": 1,
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": prompt},
        ],
    }
    if json_mode:
        request_kwargs["response_format"] = {"type": "json_object"}

    response = client.chat.completions.create(**request_kwargs)
    return response.choices[0].message.content
|
|
|
|
|
def one_chunk_initial_translation(
    source_lang: str, target_lang: str, source_text: str
) -> str:
    """
    Translate the entire text as one chunk using an LLM.

    Args:
        source_lang (str): The source language of the text.
        target_lang (str): The target language for translation.
        source_text (str): The text to be translated.

    Returns:
        str: The translated text.
    """

    system_message = f"You are an expert linguist, specializing in translation from {source_lang} to {target_lang}."

    # The f-string already interpolates every value. It must NOT be passed through
    # str.format() afterwards: the source text would then be parsed as a format
    # template, so any "{" or "}" in it would raise or be silently altered.
    prompt = f"""This is an {source_lang} to {target_lang} translation, please provide the {target_lang} translation for this text. \
Do not provide any explanations or text apart from the translation.
{source_lang}: {source_text}

{target_lang}:"""

    return get_completion(prompt, system_message=system_message)
|
|
|
|
|
def one_chunk_reflect_on_translation(
    source_lang: str,
    target_lang: str,
    source_text: str,
    translation_1: str,
    country: str = "",
) -> str:
    """
    Use an LLM to reflect on the translation, treating the entire text as one chunk.

    Args:
        source_lang (str): The source language of the text.
        target_lang (str): The target language of the translation.
        source_text (str): The original text in the source language.
        translation_1 (str): The initial translation of the source text.
        country (str): Country specified for target language. When it is not the
            empty string, the prompt additionally asks for the style of the
            target language as colloquially spoken in that country.

    Returns:
        str: The LLM's reflection on the translation, providing constructive criticism
            and suggestions for improvement.
    """

    system_message = (
        f"You are an expert linguist specializing in translation from {source_lang} to {target_lang}. "
        "You will be provided with a source text and its translation and your goal is to improve the translation."
    )

    # The only difference between the country and no-country prompts is one
    # extra sentence appended to the opening paragraph.
    task_intro = f"Your task is to carefully read a source text and a translation from {source_lang} to {target_lang}, and then give constructive criticism and helpful suggestions to improve the translation. "
    if country != "":
        task_intro += f"The final style and tone of the translation should match the style of {target_lang} colloquially spoken in {country}.\n"

    # Interpolated once via f-string. Calling str.format() on the result would
    # re-parse the source text / translation as a template and fail on braces.
    prompt = f"""{task_intro}
The source text and initial translation, delimited by XML tags <SOURCE_TEXT></SOURCE_TEXT> and <TRANSLATION></TRANSLATION>, are as follows:

<SOURCE_TEXT>
{source_text}
</SOURCE_TEXT>

<TRANSLATION>
{translation_1}
</TRANSLATION>

When writing suggestions, pay attention to whether there are ways to improve the translation's \n\
(i) accuracy (by correcting errors of addition, mistranslation, omission, or untranslated text),\n\
(ii) fluency (by applying {target_lang} grammar, spelling and punctuation rules, and ensuring there are no unnecessary repetitions),\n\
(iii) style (by ensuring the translations reflect the style of the source text and takes into account any cultural context),\n\
(iv) terminology (by ensuring terminology use is consistent and reflects the source text domain; and by only ensuring you use equivalent idioms {target_lang}).\n\

Write a list of specific, helpful and constructive suggestions for improving the translation.
Each suggestion should address one specific part of the translation.
Output only the suggestions and nothing else."""

    return get_completion(prompt, system_message=system_message)
|
|
|
|
|
def one_chunk_improve_translation(
    source_lang: str,
    target_lang: str,
    source_text: str,
    translation_1: str,
    reflection: str,
) -> str:
    """
    Ask the LLM to edit a translation according to a list of suggestions.

    The whole text is handled as a single chunk: the source text, the first
    translation and the suggestions are all embedded in one prompt.

    Args:
        source_lang (str): Language the source text is written in.
        target_lang (str): Language the text is being translated into.
        source_text (str): The original, untranslated text.
        translation_1 (str): The first-pass translation to be edited.
        reflection (str): Suggestions and criticism to apply to the translation.

    Returns:
        str: The completion returned for the editing prompt.
    """

    editor_system_message = f"You are an expert linguist, specializing in translation editing from {source_lang} to {target_lang}."

    editing_prompt = f"""Your task is to carefully read, then edit, a translation from {source_lang} to {target_lang}, taking into
account a list of expert suggestions and constructive criticisms.

The source text, the initial translation, and the expert linguist suggestions are delimited by XML tags <SOURCE_TEXT></SOURCE_TEXT>, <TRANSLATION></TRANSLATION> and <EXPERT_SUGGESTIONS></EXPERT_SUGGESTIONS> \
as follows:

<SOURCE_TEXT>
{source_text}
</SOURCE_TEXT>

<TRANSLATION>
{translation_1}
</TRANSLATION>

<EXPERT_SUGGESTIONS>
{reflection}
</EXPERT_SUGGESTIONS>

Please take into account the expert suggestions when editing the translation. Edit the translation by ensuring:

(i) accuracy (by correcting errors of addition, mistranslation, omission, or untranslated text),
(ii) fluency (by applying {target_lang} grammar, spelling and punctuation rules and ensuring there are no unnecessary repetitions), \
(iii) style (by ensuring the translations reflect the style of the source text)
(iv) terminology (inappropriate for context, inconsistent use), or
(v) other errors.

Output only the new translation and nothing else."""

    return get_completion(editing_prompt, editor_system_message)
|
|
|
|
|
def one_chunk_translate_text(
    source_lang: str, target_lang: str, source_text: str, country: str = ""
) -> str:
    """
    Translate a text that is handled as a single chunk.

    Runs three LLM steps in sequence:
        1. Produce an initial translation of the source text.
        2. Produce a reflection (suggestions) on that initial translation.
        3. Produce an improved translation from the initial one and the reflection.

    Args:
        source_lang (str): Language the source text is written in.
        target_lang (str): Language to translate into.
        source_text (str): The text to translate.
        country (str): Country specified for the target language; only passed
            to the reflection step.

    Returns:
        str: The improved translation produced by the last step.
    """
    first_draft = one_chunk_initial_translation(source_lang, target_lang, source_text)
    suggestions = one_chunk_reflect_on_translation(
        source_lang, target_lang, source_text, first_draft, country
    )
    return one_chunk_improve_translation(
        source_lang, target_lang, source_text, first_draft, suggestions
    )
|
|
|
|
|
def num_tokens_in_string(
    input_str: str, encoding_name: str = "cl100k_base"
) -> int:
    """
    Count the tokens that a tiktoken encoding produces for a string.

    Args:
        input_str (str): The string to tokenize.
        encoding_name (str, optional): Name of the tiktoken encoding to look up.
            Defaults to "cl100k_base".

    Returns:
        int: Length of the token sequence produced by encoding `input_str`.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    return len(tokenizer.encode(input_str))
|
|
|
|
|
def multichunk_initial_translation(
    source_lang: str, target_lang: str, source_text_chunks: List[str]
) -> List[str]:
    """
    Produce a first-pass translation for each chunk of a longer text.

    Each chunk is translated with its own LLM request. The request contains the
    full text (all chunks concatenated) as context, with the chunk currently
    being translated wrapped in <TRANSLATE_THIS> tags.

    Args:
        source_lang (str): Language the source text is written in.
        target_lang (str): Language to translate into.
        source_text_chunks (List[str]): The source text, split into chunks.

    Returns:
        List[str]: One completion per input chunk, in the same order.
    """

    system_message = f"You are an expert linguist, specializing in translation from {source_lang} to {target_lang}."

    prompt_template = """Your task is provide a professional translation from {source_lang} to {target_lang} of PART of a text.

The source text is below, delimited by XML tags <SOURCE_TEXT> and </SOURCE_TEXT>. Translate only the part within the source text
delimited by <TRANSLATE_THIS> and </TRANSLATE_THIS>. You can use the rest of the source text as context, but do not translate any
of the other text. Do not output anything other than the translation of the indicated part of the text.

<SOURCE_TEXT>
{tagged_text}
</SOURCE_TEXT>

To reiterate, you should translate only this part of the text, shown here again between <TRANSLATE_THIS> and </TRANSLATE_THIS>:
<TRANSLATE_THIS>
{chunk_to_translate}
</TRANSLATE_THIS>

Output only the translation of the portion you are asked to translate, and nothing else.
"""

    translated = []
    for position, current_chunk in enumerate(source_text_chunks):
        # Whole text with the current chunk marked; the other chunks are context only.
        text_before = "".join(source_text_chunks[:position])
        text_after = "".join(source_text_chunks[position + 1 :])
        tagged_text = (
            text_before
            + "<TRANSLATE_THIS>"
            + current_chunk
            + "</TRANSLATE_THIS>"
            + text_after
        )

        chunk_prompt = prompt_template.format(
            source_lang=source_lang,
            target_lang=target_lang,
            tagged_text=tagged_text,
            chunk_to_translate=current_chunk,
        )

        translated.append(get_completion(chunk_prompt, system_message=system_message))

    return translated
|
|
|
|
|
def multichunk_reflect_on_translation(
    source_lang: str,
    target_lang: str,
    source_text_chunks: List[str],
    translation_1_chunks: List[str],
    country: str = "",
) -> List[str]:
    """
    Provides constructive criticism and suggestions for improving a partial translation.

    One LLM request is made per source chunk. Each request contains the full
    text (all chunks concatenated) with the current chunk wrapped in
    <TRANSLATE_THIS> tags, plus that chunk's initial translation.

    Args:
        source_lang (str): The source language of the text.
        target_lang (str): The target language of the translation.
        source_text_chunks (List[str]): The source text divided into chunks.
        translation_1_chunks (List[str]): The translated chunks corresponding to the
            source text chunks (indexed in parallel with `source_text_chunks`).
        country (str): Country specified for target language. When it is not the
            empty string, the prompt additionally asks for the style of the
            target language as colloquially spoken in that country.

    Returns:
        List[str]: A list of reflections containing suggestions for improving each translated chunk.

    Raises:
        IndexError: If `translation_1_chunks` has fewer items than `source_text_chunks`.
    """

    system_message = (
        f"You are an expert linguist specializing in translation from {source_lang} to {target_lang}. "
        "You will be provided with a source text and its translation and your goal is to improve the translation."
    )

    # The country and no-country prompts differ only by this one optional line,
    # so a single template with a `{country_clause}` slot covers both.
    if country != "":
        country_clause = f"The final style and tone of the translation should match the style of {target_lang} colloquially spoken in {country}.\n"
    else:
        country_clause = ""

    reflection_prompt = """Your task is to carefully read a source text and part of a translation of that text from {source_lang} to {target_lang}, and then give constructive criticism and helpful suggestions for improving the translation.
{country_clause}
The source text is below, delimited by XML tags <SOURCE_TEXT> and </SOURCE_TEXT>, and the part that has been translated
is delimited by <TRANSLATE_THIS> and </TRANSLATE_THIS> within the source text. You can use the rest of the source text
as context for critiquing the translated part.

<SOURCE_TEXT>
{tagged_text}
</SOURCE_TEXT>

To reiterate, only part of the text is being translated, shown here again between <TRANSLATE_THIS> and </TRANSLATE_THIS>:
<TRANSLATE_THIS>
{chunk_to_translate}
</TRANSLATE_THIS>

The translation of the indicated part, delimited below by <TRANSLATION> and </TRANSLATION>, is as follows:
<TRANSLATION>
{translation_1_chunk}
</TRANSLATION>

When writing suggestions, pay attention to whether there are ways to improve the translation's:\n\
(i) accuracy (by correcting errors of addition, mistranslation, omission, or untranslated text),\n\
(ii) fluency (by applying {target_lang} grammar, spelling and punctuation rules, and ensuring there are no unnecessary repetitions),\n\
(iii) style (by ensuring the translations reflect the style of the source text and takes into account any cultural context),\n\
(iv) terminology (by ensuring terminology use is consistent and reflects the source text domain; and by only ensuring you use equivalent idioms {target_lang}).\n\

Write a list of specific, helpful and constructive suggestions for improving the translation.
Each suggestion should address one specific part of the translation.
Output only the suggestions and nothing else."""

    reflection_chunks = []
    for i, source_chunk in enumerate(source_text_chunks):
        # Whole text with the current chunk marked; the other chunks are context only.
        tagged_text = (
            "".join(source_text_chunks[:i])
            + "<TRANSLATE_THIS>"
            + source_chunk
            + "</TRANSLATE_THIS>"
            + "".join(source_text_chunks[i + 1 :])
        )

        prompt = reflection_prompt.format(
            source_lang=source_lang,
            target_lang=target_lang,
            country_clause=country_clause,
            tagged_text=tagged_text,
            chunk_to_translate=source_chunk,
            translation_1_chunk=translation_1_chunks[i],
        )

        reflection = get_completion(prompt, system_message=system_message)
        reflection_chunks.append(reflection)

    return reflection_chunks
|
|
|
|
|
def multichunk_improve_translation(
    source_lang: str,
    target_lang: str,
    source_text_chunks: List[str],
    translation_1_chunks: List[str],
    reflection_chunks: List[str],
) -> List[str]:
    """
    Rewrite each chunk's translation using the suggestions made for it.

    One LLM request is made per source chunk. Each request contains the full
    text (all chunks concatenated) with the current chunk wrapped in
    <TRANSLATE_THIS> tags, that chunk's initial translation, and the
    suggestions for that chunk.

    Args:
        source_lang (str): Language the source text is written in.
        target_lang (str): Language being translated into.
        source_text_chunks (List[str]): The source text, split into chunks.
        translation_1_chunks (List[str]): Initial translation of each chunk,
            indexed in parallel with `source_text_chunks`.
        reflection_chunks (List[str]): Suggestions for each chunk, indexed in
            parallel with `source_text_chunks`.

    Returns:
        List[str]: One completion per source chunk, in the same order.
    """

    system_message = f"You are an expert linguist, specializing in translation editing from {source_lang} to {target_lang}."

    prompt_template = """Your task is to carefully read, then improve, a translation from {source_lang} to {target_lang}, taking into
account a set of expert suggestions and constructive critisms. Below, the source text, initial translation, and expert suggestions are provided.

The source text is below, delimited by XML tags <SOURCE_TEXT> and </SOURCE_TEXT>, and the part that has been translated
is delimited by <TRANSLATE_THIS> and </TRANSLATE_THIS> within the source text. You can use the rest of the source text
as context, but need to provide a translation only of the part indicated by <TRANSLATE_THIS> and </TRANSLATE_THIS>.

<SOURCE_TEXT>
{tagged_text}
</SOURCE_TEXT>

To reiterate, only part of the text is being translated, shown here again between <TRANSLATE_THIS> and </TRANSLATE_THIS>:
<TRANSLATE_THIS>
{chunk_to_translate}
</TRANSLATE_THIS>

The translation of the indicated part, delimited below by <TRANSLATION> and </TRANSLATION>, is as follows:
<TRANSLATION>
{translation_1_chunk}
</TRANSLATION>

The expert translations of the indicated part, delimited below by <EXPERT_SUGGESTIONS> and </EXPERT_SUGGESTIONS>, is as follows:
<EXPERT_SUGGESTIONS>
{reflection_chunk}
</EXPERT_SUGGESTIONS>

Taking into account the expert suggestions rewrite the translation to improve it, paying attention
to whether there are ways to improve the translation's

(i) accuracy (by correcting errors of addition, mistranslation, omission, or untranslated text),
(ii) fluency (by applying {target_lang} grammar, spelling and punctuation rules and ensuring there are no unnecessary repetitions), \
(iii) style (by ensuring the translations reflect the style of the source text)
(iv) terminology (inappropriate for context, inconsistent use), or
(v) other errors.

Output only the new translation of the indicated part and nothing else."""

    improved = []
    for position, current_chunk in enumerate(source_text_chunks):
        # Whole text with the current chunk marked; the other chunks are context only.
        text_before = "".join(source_text_chunks[:position])
        text_after = "".join(source_text_chunks[position + 1 :])
        tagged_text = (
            text_before
            + "<TRANSLATE_THIS>"
            + current_chunk
            + "</TRANSLATE_THIS>"
            + text_after
        )

        chunk_prompt = prompt_template.format(
            source_lang=source_lang,
            target_lang=target_lang,
            tagged_text=tagged_text,
            chunk_to_translate=current_chunk,
            translation_1_chunk=translation_1_chunks[position],
            reflection_chunk=reflection_chunks[position],
        )

        improved.append(get_completion(chunk_prompt, system_message=system_message))

    return improved
|
|
|
|
|
def multichunk_translation(
    source_lang: str,
    target_lang: str,
    source_text_chunks: List[str],
    country: str = "",
) -> List[str]:
    """
    Translate a chunked text: initial translation, reflection, then improvement.

    The initial translations and the reflections are computed inside this
    function; they are not parameters.

    Args:
        source_lang (str): The source language of the text chunks.
        target_lang (str): The target language for translation.
        source_text_chunks (List[str]): The list of source text chunks to be translated.
        country (str): Country specified for target language; only passed to
            the reflection step.

    Returns:
        List[str]: The list of improved translations for each source text chunk.
    """

    # Step 1: first-pass translation of every chunk.
    translation_1_chunks = multichunk_initial_translation(
        source_lang, target_lang, source_text_chunks
    )

    # Step 2: suggestions for each first-pass translation.
    reflection_chunks = multichunk_reflect_on_translation(
        source_lang,
        target_lang,
        source_text_chunks,
        translation_1_chunks,
        country,
    )

    # Step 3: rewrite each chunk using its suggestions.
    translation_2_chunks = multichunk_improve_translation(
        source_lang,
        target_lang,
        source_text_chunks,
        translation_1_chunks,
        reflection_chunks,
    )

    return translation_2_chunks
|
|
|
|
|
def calculate_chunk_size(token_count: int, token_limit: int) -> int:
    """
    Calculate the chunk size based on the token count and token limit.

    Args:
        token_count (int): The total number of tokens.
        token_limit (int): The maximum number of tokens allowed per chunk.

    Returns:
        int: The calculated chunk size, never larger than `token_limit` when
            `token_count` exceeds it.

    Description:
        If the token count is less than or equal to the token limit, the token
        count itself is returned as the chunk size.
        Otherwise the number of chunks is the token count divided by the token
        limit, rounded up, and the base chunk size is the token count divided
        by that number of chunks (rounded down).
        If the token count is not a multiple of the token limit, the remainder
        (token_count % token_limit) divided by the number of chunks is added.
        That adjustment can overshoot the limit (e.g. 999 tokens with a limit
        of 500 would give 748), so the result is capped at `token_limit`.

    Example:
        >>> calculate_chunk_size(1000, 500)
        500
        >>> calculate_chunk_size(1530, 500)
        389
        >>> calculate_chunk_size(2242, 500)
        496
        >>> calculate_chunk_size(999, 500)
        500
    """

    if token_count <= token_limit:
        return token_count

    # Ceiling division: smallest number of chunks that keeps each within the limit.
    num_chunks = (token_count + token_limit - 1) // token_limit
    chunk_size = token_count // num_chunks

    remaining_tokens = token_count % token_limit
    if remaining_tokens > 0:
        chunk_size += remaining_tokens // num_chunks

    # Enforce the documented contract: a chunk must not exceed the limit.
    return min(chunk_size, token_limit)
|
|
|
|
|
def translate(
    source_lang: str,
    target_lang: str,
    source_text: str,
    country: str = "",
    max_tokens: int = MAX_TOKENS_PER_CHUNK,
) -> str:
    """
    Translate the source_text from source_lang to target_lang.

    Texts with fewer than `max_tokens` tokens are translated as a single chunk.
    Longer texts are split into chunks, each chunk is translated with the
    rest of the text as context, and the results are concatenated.

    Args:
        source_lang (str): The source language of the text.
        target_lang (str): The target language for translation.
        source_text (str): The text to be translated.
        country (str): Country specified for target language. Defaults to "",
            matching the other translation functions in this module.
        max_tokens (int): Token threshold for switching to multi-chunk
            translation; also the per-chunk limit used to compute the chunk
            size. Defaults to MAX_TOKENS_PER_CHUNK.

    Returns:
        str: The final translation. In the multi-chunk case the translated
            chunks are joined with no separator.
    """

    num_tokens_in_text = num_tokens_in_string(source_text)
    ic(num_tokens_in_text)

    if num_tokens_in_text < max_tokens:
        ic("Translating text as single chunk")
        return one_chunk_translate_text(
            source_lang, target_lang, source_text, country
        )

    ic("Translating text as multiple chunks")

    token_size = calculate_chunk_size(
        token_count=num_tokens_in_text, token_limit=max_tokens
    )
    ic(token_size)

    # Chunk sizes are measured in tokens, not characters; no overlap so that
    # joining the translated chunks does not duplicate text.
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name="gpt-4",
        chunk_size=token_size,
        chunk_overlap=0,
    )
    source_text_chunks = text_splitter.split_text(source_text)

    translation_2_chunks = multichunk_translation(
        source_lang, target_lang, source_text_chunks, country
    )

    return "".join(translation_2_chunks)
|
|