import ast
import functools
import json
import os
import sys
import tempfile
import time
import typing
import uuid
import pytest
from tests.utils import wrap_test_forked
from src.prompter_utils import base64_encode_jinja_template, base64_decode_jinja_template
from src.vision.utils_vision import process_file_list
from src.utils import get_list_or_str, read_popen_pipes, get_token_count, reverse_ucurve_list, undo_reverse_ucurve_list, \
is_uuid4, has_starting_code_block, extract_code_block_content, looks_like_json, get_json, is_full_git_hash, \
deduplicate_names, handle_json, check_input_type, start_faulthandler, remove, get_gradio_depth, create_typed_dict, \
execute_cmd_stream
from src.enums import invalid_json_str, user_prompt_for_fake_system_prompt0
from src.prompter import apply_chat_template
import subprocess as sp
# Call the project's start_faulthandler() helper once at import time, before any test below runs.
start_faulthandler()
@wrap_test_forked
def test_get_list_or_str():
    """Check get_list_or_str on a real list, a plain string, and a stringified list."""
    as_list = ['foo', 'bar']
    # A genuine list is returned as an equal list.
    assert get_list_or_str(['foo', 'bar']) == as_list
    # A plain string that is not a list literal is returned as the same string.
    assert get_list_or_str('foo') == 'foo'
    # A string containing a list literal is returned as the corresponding list.
    assert get_list_or_str("['foo', 'bar']") == as_list
@wrap_test_forked
def test_stream_popen1():
    """Stream stdout/stderr of a one-line Python subprocess via read_popen_pipes.

    The test only echoes whatever the pipes yield; it makes no assertions on content.
    """
    interpreter = sys.executable
    flags = "-q -u"
    snippet = "print('hi')"
    # One shell command string; the snippet is wrapped in double quotes for the shell.
    command = f"{interpreter} {flags} -c \"{snippet}\""
    with sp.Popen(command, stdout=sp.PIPE, stderr=sp.PIPE, text=True, shell=True) as proc:
        for stdout_line, stderr_line in read_popen_pipes(proc):
            print(stdout_line, end='')
            print(stderr_line, end='')
        proc.poll()
@wrap_test_forked
def test_stream_popen2():
    """Stream interleaved stdout/stderr from a slow shell script via read_popen_pipes.

    The script is written to ``pieces.sh`` in the current working directory, run
    through the shell, and its output is echoed as it arrives.  The generated
    script is always removed afterwards so the test leaves no stray file behind.
    """
    script = """for i in 0 1 2 3 4 5
do
echo "This messages goes to stdout $i"
sleep 1
echo This message goes to stderr >&2
sleep 1
done
"""
    script_path = 'pieces.sh'
    try:
        with open(script_path, 'wt') as f:
            f.write(script)
        os.chmod(script_path, 0o755)
        # shell=True is kept on purpose: the script has no shebang line, so it is run via the shell.
        with sp.Popen(["./pieces.sh"], stdout=sp.PIPE, stderr=sp.PIPE, text=True, shell=True) as p:
            for out_line, err_line in read_popen_pipes(p):
                print(out_line, end='')
                print(err_line, end='')
            p.poll()
    finally:
        # Clean up the generated script even if streaming raised.
        if os.path.exists(script_path):
            os.remove(script_path)
@wrap_test_forked
def test_stream_python_execution(capsys):
    """Run a small Python script through execute_cmd_stream and check capture and streaming.

    Verifies the return code, that ``result.stdout`` / ``result.stderr`` contain the
    expected lines in order, and that tagged lines (``STDOUT:`` / ``STDERR:``) were
    printed to this process's stdout while the script ran.

    :param capsys: pytest fixture used to read what was printed during the call.
    """
    # The loop body must be indented inside the literal, otherwise the child
    # interpreter fails with IndentationError and the return-code check cannot pass.
    script = """
import sys
import time
for i in range(3):
    print(f"This message goes to stdout {i}")
    time.sleep(0.1)
    print(f"This message goes to stderr {i}", file=sys.stderr)
    time.sleep(0.1)
"""
    result = execute_cmd_stream(
        script_content=script,
        cwd=None,
        env=None,
        timeout=5,
        capture_output=True,
        text=True,
        print_tags=True,
        print_literal=False,
    )
    # Capture the printed output
    captured = capsys.readouterr()
    # Print the captured output for verification
    print("Captured output:")
    print(captured.out)
    # Check return code
    assert result.returncode == 0, f"Expected return code 0, but got {result.returncode}"
    # Check stdout content
    expected_stdout = "This message goes to stdout 0\nThis message goes to stdout 1\nThis message goes to stdout 2\n"
    assert expected_stdout in result.stdout, f"Expected stdout to contain:\n{expected_stdout}\nBut got:\n{result.stdout}"
    # Check stderr content
    expected_stderr = "This message goes to stderr 0\nThis message goes to stderr 1\nThis message goes to stderr 2\n"
    assert expected_stderr in result.stderr, f"Expected stderr to contain:\n{expected_stderr}\nBut got:\n{result.stderr}"
    # Check if the output was streamed (should appear in captured output)
    assert "STDOUT: This message goes to stdout 0" in captured.out, "Streaming output not detected in stdout"
    assert "STDERR: This message goes to stderr 0" in captured.out, "Streaming output not detected in stderr"
    print("All tests passed successfully!")
def test_stream_python_execution_empty_lines(capsys):
    """Check that execute_cmd_stream prints tagged lines only for non-empty output lines.

    The script prints blank lines around one stdout line and one stderr line; exactly
    one ``STDOUT:`` and one ``STDERR:`` tag are expected in what was printed.

    :param capsys: pytest fixture used to read what was printed during the call.
    """
    script = """
import sys
import time
print()
print("Hello")
print()
print("World", file=sys.stderr)
print()
"""
    run_kwargs = dict(
        script_content=script,
        cwd=None,
        env=None,
        timeout=5,
        capture_output=True,
        text=True,
    )
    result = execute_cmd_stream(**run_kwargs)
    printed = capsys.readouterr().out
    print("Captured output:")
    print(printed)
    # Blank lines must not produce tagged output.
    stdout_tags = printed.count("STDOUT:")
    stderr_tags = printed.count("STDERR:")
    assert stdout_tags == 1, "Expected only one STDOUT line"
    assert stderr_tags == 1, "Expected only one STDERR line"
    assert "STDOUT: Hello" in printed, "Expected 'Hello' in stdout"
    assert "STDERR: World" in printed, "Expected 'World' in stderr"
    print("All tests passed successfully!")
@wrap_test_forked
def test_memory_limit():
    """Run a memory-hungry script under a memory cap and expect it to be terminated.

    The child is launched with the interpreter running this test (``sys.executable``)
    rather than whatever ``python`` happens to be on PATH, so it uses the same
    environment as the test itself.
    """
    result = execute_cmd_stream(cmd=[sys.executable, './tests/memory_hog_script.py'], max_memory_usage=500_000_000)
    # Emit the captured output first so it is visible when the assertion below fails.
    print(result.stdout, file=sys.stderr, flush=True)
    print(result.stderr, file=sys.stderr, flush=True)
    # NOTE(review): -15 presumably follows the subprocess convention (-N == killed by signal N,
    # 15 == SIGTERM) -- confirm against execute_cmd_stream.
    assert result.returncode == -15
@pytest.mark.parametrize("text_context_list",
                         ['text_context_list1', 'text_context_list2', 'text_context_list3', 'text_context_list4',
                          'text_context_list5', 'text_context_list6'])
@pytest.mark.parametrize("system_prompt", ['auto', ''])
@pytest.mark.parametrize("context", ['context1', 'context2'])
@pytest.mark.parametrize("iinput", ['iinput1', 'iinput2'])
@pytest.mark.parametrize("chat_conversation", ['chat_conversation1', 'chat_conversation2'])
@pytest.mark.parametrize("instruction", ['instruction1', 'instruction2'])
@wrap_test_forked
def test_limited_prompt(instruction, chat_conversation, iinput, context, system_prompt, text_context_list):
    """Check that get_limited_prompt keeps prompt token counts within the model limits.

    Each parameter arrives as a label (e.g. ``'instruction1'``) and is replaced here by
    the short or long payload it names before calling get_limited_prompt with a llama2
    prompter and a 4096-token model length.
    """
    import random
    import string
    from transformers import AutoTokenizer
    from src.prompter import Prompter
    from src.gen import get_limited_prompt

    # Resolve the short/long variants selected by the parametrized labels.
    if instruction == 'instruction1':
        instruction = 'Who are you?'
    else:
        instruction = ' '.join(['foo_%s ' % x for x in range(500)])

    if iinput == 'iinput1':
        iinput = 'Extra instruction info'
    else:
        iinput = ' '.join(['iinput_%s ' % x for x in range(500)])

    if context == 'context1':
        context = 'context'
    else:
        context = ' '.join(['context_%s ' % x for x in range(500)])

    if chat_conversation == 'chat_conversation1':
        chat_conversation = []
    else:
        chat_conversation = [['user_conv_%s ' % x, 'bot_conv_%s ' % x] for x in range(500)]

    alphabet = string.ascii_letters + string.digits

    def random_doc(index, length):
        # One document with a random alphanumeric payload of the given length.
        return 'documentlong_%s_%s' % (index, ''.join(random.choices(alphabet, k=length)))

    # All candidate document lists are built up front, then one is selected by label.
    candidate_lists = {
        'text_context_list1': [],
        'text_context_list2': ['doc_%s ' % x for x in range(500)],
        'text_context_list3': ['doc_%s ' % x for x in range(10)],
        'text_context_list4': ['documentmany_%s ' % x for x in range(10000)],
        'text_context_list5': [random_doc(x, 300) for x in range(20)],
        'text_context_list6': [random_doc(x, 4000) for x in range(1)],
    }
    if text_context_list not in candidate_lists:
        raise ValueError("No such %s" % text_context_list)
    text_context_list = candidate_lists[text_context_list]

    tokenizer = AutoTokenizer.from_pretrained('h2oai/h2ogpt-4096-llama2-7b-chat')

    prompt_type = 'llama2'
    prompt_dict = None
    debug = False
    chat = True
    stream_output = True
    prompter = Prompter(prompt_type, prompt_dict, debug=debug,
                        stream_output=stream_output,
                        system_prompt=system_prompt,
                        tokenizer=tokenizer)

    min_max_new_tokens = 512  # like in get_limited_prompt()
    max_input_tokens = -1
    max_new_tokens = 1024
    model_max_length = 4096

    limited = get_limited_prompt(instruction, iinput, tokenizer,
                                 prompter=prompter,
                                 max_new_tokens=max_new_tokens,
                                 context=context,
                                 chat_conversation=chat_conversation,
                                 text_context_list=text_context_list,
                                 model_max_length=model_max_length,
                                 min_max_new_tokens=min_max_new_tokens,
                                 max_input_tokens=max_input_tokens,
                                 verbose=True)
    # The call returns 16 values; the last two are not used here.
    (estimated_full_prompt,
     instruction, iinput, context,
     num_prompt_tokens, max_new_tokens,
     num_prompt_tokens0, num_prompt_tokens_actual,
     history_to_use_final, external_handle_chat_conversation,
     top_k_docs_trial, one_doc_size, truncation_generation, system_prompt, _, _) = limited

    summary = '%s -> %s or %s: len(history_to_use_final): %s top_k_docs_trial=%s one_doc_size: %s' % (
        num_prompt_tokens0,
        num_prompt_tokens,
        num_prompt_tokens_actual,
        len(history_to_use_final),
        top_k_docs_trial,
        one_doc_size)
    print(summary, flush=True, file=sys.stderr)

    assert num_prompt_tokens <= model_max_length + min_max_new_tokens
    # actual might be less due to token merging for characters across parts, but not more
    assert num_prompt_tokens >= num_prompt_tokens_actual
    assert num_prompt_tokens_actual <= model_max_length

    # Re-apply the reported document selection and check the kept docs fit the model length.
    if top_k_docs_trial > 0:
        kept_docs = text_context_list[:top_k_docs_trial]
    elif one_doc_size is not None:
        kept_docs = [text_context_list[0][:one_doc_size]]
    else:
        kept_docs = []
    total_doc_tokens = sum(get_token_count(doc, tokenizer) for doc in kept_docs)
    assert total_doc_tokens <= model_max_length
@wrap_test_forked
def test_reverse_ucurve():
    """Check reverse_ucurve_list and undo_reverse_ucurve_list against known pairs.

    For every (original, reordered) pair, reverse_ucurve_list(original) must equal
    reordered and undo_reverse_ucurve_list(reordered) must give back original.
    """
    pairs = [
        ([1, 2, 3, 4, 5, 6, 7, 8], [2, 4, 6, 8, 7, 5, 3, 1]),
        ([1], [1]),
        ([1, 2], [2, 1]),
        ([1, 2, 3], [2, 3, 1]),
        ([1, 2, 3, 4], [2, 4, 3, 1]),
    ]
    for original, reordered in pairs:
        assert reverse_ucurve_list(original) == reordered
        assert undo_reverse_ucurve_list(reordered) == original
@wrap_test_forked
def check_gradio():
    """Assert that the importable gradio module has a truthy ``__h2oai__`` attribute."""
    import gradio
    marker = gradio.__h2oai__
    assert marker
@wrap_test_forked
def test_is_uuid4():
    """Check is_uuid4 on one valid version-4 UUID and three strings it must reject."""
    cases = [
        ("f47ac10b-58cc-4372-a567-0e02b2c3d479", True),  # valid UUID v4
        ("not-a-uuid", False),  # not a UUID at all
        # UUID-shaped, but expected to be rejected because it does not follow the v4 pattern
        ("12345678-1234-1234-1234-123456789abc", False),
        ("xyz", False),  # not a UUID at all
    ]
    candidates = [text for text, _ in cases]
    expected = [verdict for _, verdict in cases]
    # Evaluate every candidate, then compare all verdicts at once.
    assert [is_uuid4(s) for s in candidates] == expected
@wrap_test_forked
def test_is_git_hash():
    """Check is_full_git_hash on a 40-hex-char string, one with a non-hex char, and a short one."""
    full_hash = "1a3b5c7d9e1a3b5c7d9e1a3b5c7d9e1a3b5c7d9e"  # 40 hex characters
    non_hex_hash = "1G3b5c7d9e1a3b5c7d9e1a3b5c7d9e1a3b5c7d9e"  # contains 'G'
    short_hash = "1a3b5c7d"  # only 8 characters
    verdicts = [is_full_git_hash(h) for h in (full_hash, non_hex_hash, short_hash)]
    assert verdicts == [True, False, False]
@wrap_test_forked
def test_chat_template():
    """Check apply_chat_template output for several base models' tokenizers.

    For each model the rendered prompt must contain the system prompt text, the new
    instruction, and both sides of the single prior conversation turn.
    """
    from transformers import AutoTokenizer

    instruction = "Who are you?"
    system_prompt = "Be kind"
    history_to_use = [('Are you awesome?', "Yes I'm awesome.")]
    image_file = []
    supports_system_prompt = ['meta-llama/Llama-2-7b-chat-hf', 'openchat/openchat-3.5-1210', 'SeaLLMs/SeaLLM-7B-v2',
                              'h2oai/h2ogpt-gm-experimental']
    other_base_models = ['h2oai/mixtral-gm-rag-experimental-v2']
    prior_user, prior_bot = history_to_use[0]

    for base_model in supports_system_prompt + other_base_models:
        tokenizer = AutoTokenizer.from_pretrained(base_model)
        prompt = apply_chat_template(instruction, system_prompt, history_to_use, image_file,
                                     tokenizer,
                                     user_prompt_for_fake_system_prompt=user_prompt_for_fake_system_prompt0,
                                     verbose=True)
        assert 'Be kind' in prompt  # put into pre-conversation if no actual system prompt
        assert instruction in prompt
        assert prior_user in prompt
        assert prior_bot in prompt
@wrap_test_forked
def test_chat_template_images():
    """Check chat templating with image files for the InternVL-Chat-V1-5 tokenizer.

    First renders a system+user message list with the tokenizer's own chat template,
    then renders through apply_chat_template with three image paths and one prior turn.
    """
    from transformers import AutoTokenizer

    base_model = 'OpenGVLab/InternVL-Chat-V1-5'
    history_to_use = [('Are you awesome?', "Yes I'm awesome.")]
    instruction = 'What is the name of the tower in one of the images?'
    system_prompt = 'You are h2oGPTe, an expert question-answering AI system created by H2O.ai that performs like GPT-4 by OpenAI.'
    image_file = ['/tmp/image_file_0f5f011d-c907-4836-9f38-0ba579b45ffc.jpeg',
                  '/tmp/image_file_60dce245-af39-4f8c-9651-df9ae0bd0afa.jpeg',
                  '/tmp/image_file_e0b32625-9de3-40d7-98fb-c2e6368d6d73.jpeg']
    test_only = False
    verbose = False

    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

    # The tokenizer's built-in template must render a plain system+user exchange.
    messages = [{'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': instruction}]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    assert prompt is not None

    prompt = apply_chat_template(instruction, system_prompt, history_to_use, image_file,
                                 tokenizer,
                                 user_prompt_for_fake_system_prompt=user_prompt_for_fake_system_prompt0,
                                 test_only=test_only,
                                 verbose=verbose)
    prior_user, prior_bot = history_to_use[0]
    assert 'h2oGPTe' in prompt  # put into pre-conversation if no actual system prompt
    assert instruction in prompt
    assert prior_user in prompt
    assert prior_bot in prompt
@wrap_test_forked
def test_partial_codeblock():
    """Exercise code-block and JSON extraction helpers on complete and partial (streamed) text.

    Covers has_starting_code_block, extract_code_block_content, looks_like_json and
    get_json called with ``fixup=False``.
    """
    # Smoke check: raises if invalid_json_str cannot be serialized.
    json.dumps(invalid_json_str)
    # Example usages:
    example_1 = "```code block starts immediately"
    example_2 = "\n ```code block after newline and spaces"
    # Single-line literal with an explicit HTML line break before the fence.
    example_3 = "<br>```code block after HTML line break"
    example_4 = "This is a regular text without a code block."
    assert has_starting_code_block(example_1)
    assert has_starting_code_block(example_2)
    assert has_starting_code_block(example_3)
    assert not has_starting_code_block(example_4)
    # Example usages:
    example_stream_1 = "```code block content here```more text"
    example_stream_2 = "```code block content with no end yet..."
    example_stream_3 = "```\ncode block content here\n```\nmore text"
    example_stream_4 = "```\ncode block content \nwith no end yet..."
    example_stream_5 = "\n ```\ncode block content here\n```\nmore text"
    example_stream_6 = "\n ```\ncode block content \nwith no end yet..."
    example_stream_7 = "more text"
    assert extract_code_block_content(example_stream_1) == "block content here"
    assert extract_code_block_content(example_stream_2) == "block content with no end yet..."
    assert extract_code_block_content(example_stream_3) == "code block content here"
    assert extract_code_block_content(example_stream_4) == "code block content \nwith no end yet..."
    assert extract_code_block_content(example_stream_5) == "code block content here"
    assert extract_code_block_content(example_stream_6) == "code block content \nwith no end yet..."
    assert extract_code_block_content(example_stream_7) == ""
    # Empty results are compared with ==, not `is`: identity of string literals is an
    # implementation detail and `is ''` triggers a SyntaxWarning on modern Python.
    # Test case 1: Empty string
    assert extract_code_block_content("") == '', "Test 1 Failed: Should return an empty string for empty input"
    # Test case 2: No starting code block
    assert extract_code_block_content(
        "No code block here") == '', "Test 2 Failed: Should return an empty string if there's no starting code block"
    # Test case 3: Code block at the start without ending
    assert extract_code_block_content(
        "```text\nStarting without end") == "Starting without end", "Test 3 Failed: Should return the content of code block starting at the beginning"
    # Test case 4: Code block at the end without starting
    assert extract_code_block_content(
        "Text before code block```text\nEnding without start") == "Ending without start", "Test 4 Failed: Should extract text following starting delimiter regardless of position"
    # Test case 5: Code block in the middle with proper closing
    assert extract_code_block_content(
        "Text before ```text\ncode block``` text after") == "code block", "Test 5 Failed: Should extract the code block in the middle"
    # Test case 6: Multiple code blocks, only extracts the first one
    assert extract_code_block_content(
        "```text\nFirst code block``` Text in between ```Second code block```") == "First code block", "Test 6 Failed: Should only extract the first code block"
    # Test case 7: Code block with only whitespace inside
    assert extract_code_block_content(
        "``` ```") == "", "Test 7 Failed: Should return an empty string for a code block with only whitespace"
    # Test case 8: Newline characters inside code block
    assert extract_code_block_content(
        "```\nLine 1\nLine 2\n```") == "Line 1\nLine 2", "Test 8 Failed: Should preserve newline characters within code block but not leading/trailing newlines due to .strip()"
    # Test case 9: Code block with special characters
    special_characters = "```text\nSpecial characters !@#$%^&*()```"
    assert extract_code_block_content(
        special_characters) == "Special characters !@#$%^&*()", "Test 9 Failed: Should correctly handle special characters"
    # Test case 10: No starting code block but with ending delimiter
    assert extract_code_block_content(
        "Text with ending code block delimiter```") == '', "Test 10 Failed: Should return an empty string if there's no starting code block but with an ending delimiter"
    # Test cases
    assert looks_like_json('{ "key": "value" }'), "Failed: JSON object"
    assert looks_like_json('[1, 2, 3]'), "Failed: JSON array"
    assert looks_like_json(' "string" '), "Failed: JSON string"
    assert looks_like_json('null'), "Failed: JSON null"
    assert looks_like_json(' true '), "Failed: JSON true"
    assert looks_like_json('123'), "Failed: JSON number"
    assert not looks_like_json('Just a plain text'), "Failed: Not JSON"
    assert not looks_like_json('```code block```'), "Failed: Code block"
    # Test cases
    get_json_nofixup = functools.partial(get_json, fixup=False)
    assert get_json_nofixup(
        '{"key": "value"}') == '{"key": "value"}', "Failed: Valid JSON object should be returned as is."
    assert get_json_nofixup('[1, 2, 3]') == '[1, 2, 3]', "Failed: Valid JSON array should be returned as is."
    assert get_json_nofixup('```text\nSome code```') == 'Some code', "Failed: Code block content should be returned."
    assert get_json_nofixup(
        'Some random text') == invalid_json_str, "Failed: Random text should lead to 'invalid json' return."
    assert get_json_nofixup(
        '```{"key": "value in code block"}```') == '{"key": "value in code block"}', "Failed: JSON in code block should be correctly extracted and returned."
    assert get_json_nofixup(
        '```code\nmore code```') == 'more code', "Failed: Multi-line code block content should be returned."
    assert get_json_nofixup(
        '```\n{"key": "value"}\n```') == '{"key": "value"}', "Failed: JSON object in code block with new lines should be correctly extracted and returned."
    assert get_json_nofixup('') == invalid_json_str, "Failed: Empty string should lead to 'invalid json' return."
    assert get_json_nofixup(
        'True') == invalid_json_str, "Failed: Non-JSON 'True' value should lead to 'invalid json' return."
    assert get_json_nofixup(
        '{"incomplete": true,') == '{"incomplete": true,', "Failed: Incomplete JSON should still be considered as JSON and returned as is."
    answer = """Here is an example JSON that fits the provided schema:
```json
{
"name": "John Doe",
"age": 30,
"skills": ["Java", "Python", "JavaScript"],
"work history": [
{
"company": "ABC Corp",
"duration": "2018-2020",
"position": "Software Engineer"
},
{
"company": "XYZ Inc",
"position": "Senior Software Engineer",
"duration": "2020-Present"
}
]
}
```
Note that the `work history` array contains two objects, each with a `company`, `duration`, and `position` property. The `skills` array contains three string elements, each with a maximum length of 10 characters. The `name` and `age` properties are also present and are of the correct data types."""
    assert get_json_nofixup(answer) == """{
"name": "John Doe",
"age": 30,
"skills": ["Java", "Python", "JavaScript"],
"work history": [
{
"company": "ABC Corp",
"duration": "2018-2020",
"position": "Software Engineer"
},
{
"company": "XYZ Inc",
"position": "Senior Software Engineer",
"duration": "2020-Present"
}
]
}"""
    # JSON within a code block
    json_in_code_block = """
Here is an example JSON:
```json
{"key": "value"}
```
"""
    # Plain JSON response
    plain_json_response = '{"key": "value"}'
    # Invalid JSON or non-JSON response
    non_json_response = "This is just some text."
    # Tests
    assert get_json_nofixup(
        json_in_code_block).strip() == '{"key": "value"}', "Should extract and return JSON from a code block."
    assert get_json_nofixup(plain_json_response) == '{"key": "value"}', "Should return plain JSON as is."
    assert get_json_nofixup(
        non_json_response) == invalid_json_str, "Should return 'invalid json' for non-JSON response."
    # Test with the provided example
    stream_content = """ {\n \"name\": \"John Doe\",\n \"email\": \"john.doe@example.com\",\n \"jobTitle\": \"Software Developer\",\n \"department\": \"Technology\",\n \"hireDate\": \"2020-01-01\",\n \"employeeId\": 123456,\n \"manager\": {\n \"name\": \"Jane Smith\",\n \"email\": \"jane.smith@example.com\",\n \"jobTitle\": \"Senior Software Developer\"\n },\n \"skills\": [\n \"Java\",\n \"Python\",\n \"JavaScript\",\n \"React\",\n \"Spring\"\n ],\n \"education\": {\n \"degree\": \"Bachelor's Degree\",\n \"field\": \"Computer Science\",\n \"institution\": \"Example University\",\n \"graduationYear\": 2018\n },\n \"awards\": [\n {\n \"awardName\": \"Best Developer of the Year\",\n \"year\": 2021\n },\n {\n \"awardName\": \"Most Valuable Team Player\",\n \"year\": 2020\n }\n ],\n \"performanceRatings\": {\n \"communication\": 4.5,\n \"teamwork\": 4.8,\n \"creativity\": 4.2,\n \"problem-solving\": 4.6,\n \"technical skills\": 4.7\n }\n}\n```"""
    extracted_content = get_json_nofixup(stream_content)
    assert extracted_content == """{\n \"name\": \"John Doe\",\n \"email\": \"john.doe@example.com\",\n \"jobTitle\": \"Software Developer\",\n \"department\": \"Technology\",\n \"hireDate\": \"2020-01-01\",\n \"employeeId\": 123456,\n \"manager\": {\n \"name\": \"Jane Smith\",\n \"email\": \"jane.smith@example.com\",\n \"jobTitle\": \"Senior Software Developer\"\n },\n \"skills\": [\n \"Java\",\n \"Python\",\n \"JavaScript\",\n \"React\",\n \"Spring\"\n ],\n \"education\": {\n \"degree\": \"Bachelor's Degree\",\n \"field\": \"Computer Science\",\n \"institution\": \"Example University\",\n \"graduationYear\": 2018\n },\n \"awards\": [\n {\n \"awardName\": \"Best Developer of the Year\",\n \"year\": 2021\n },\n {\n \"awardName\": \"Most Valuable Team Player\",\n \"year\": 2020\n }\n ],\n \"performanceRatings\": {\n \"communication\": 4.5,\n \"teamwork\": 4.8,\n \"creativity\": 4.2,\n \"problem-solving\": 4.6,\n \"technical skills\": 4.7\n }\n}"""
def test_partial_codeblock2():
    """Check has_starting_code_block on three texts with an opening fence and one without."""
    example_1 = "```code block starts immediately"
    example_2 = "\n ```code block after newline and spaces"
    # Single-line literal with an explicit HTML line break before the fence.
    example_3 = "<br>```code block after HTML line break"
    example_4 = "This is a regular text without a code block."
    assert has_starting_code_block(example_1)
    assert has_starting_code_block(example_2)
    assert has_starting_code_block(example_3)
    assert not has_starting_code_block(example_4)
def test_extract_code_block_content():
    """Check extract_code_block_content on closed, unclosed, absent and nested code fences."""
    # (input text, expected extracted content) for the short streaming examples.
    short_cases = [
        ("```code block content here```more text", "block content here"),
        ("```code block content with no end yet...", "block content with no end yet..."),
        ("```\ncode block content here\n```\nmore text", "code block content here"),
        ("```\ncode block content \nwith no end yet...", "code block content \nwith no end yet..."),
        ("\n ```\ncode block content here\n```\nmore text", "code block content here"),
        ("\n ```\ncode block content \nwith no end yet...", "code block content \nwith no end yet..."),
        ("more text", ""),
    ]
    # A ```json fence opened directly after a ```markdown fence, closed only once.
    nested_fence_text = """```markdown
```json
{
"Employee": {
"Name": "Henry",
"Title": "AI Scientist",
"Department": "AI",
"Location": "San Francisco",
"Contact": {
"Email": "henryai@gmail.com",
"Phone": "+1-234-567-8901"
},
"Profile": {
"Education": [
{
"Institution": "Stanford University",
"Degree": "Ph.D.",
"Field": "Computer Science"
},
{
"Institution": "University of California, Berkeley",
"Degree": "M.S.",
"Field": "Artificial Intelligence"
}
],
"Experience": [
{
"Company": "Google",
"Role": "Senior AI Engineer",
"Duration": "5 years"
},
{
"Company": "Facebook",
"Role": "Principal AI Engineer",
"Duration": "3 years"
}
],
"Skills": [
"Python",
"TensorFlow",
"PyTorch",
"Natural Language Processing",
"Machine Learning"
],
"Languages": [
"English",
"French",
"Spanish"
],
"Certifications": [
{
"Name": "Certified AI Professional",
"Issuing Body": "AI Professional Association"
},
{
"Name": "Advanced AI Course Certificate",
"Issuing Body": "AI Institute"
}
]
}
}
}
```
"""
    for streamed_text, expected_content in short_cases:
        assert extract_code_block_content(streamed_text) == expected_content
    # Only the JSON payload is expected back, without either fence line.
    nested_expected = """{
"Employee": {
"Name": "Henry",
"Title": "AI Scientist",
"Department": "AI",
"Location": "San Francisco",
"Contact": {
"Email": "henryai@gmail.com",
"Phone": "+1-234-567-8901"
},
"Profile": {
"Education": [
{
"Institution": "Stanford University",
"Degree": "Ph.D.",
"Field": "Computer Science"
},
{
"Institution": "University of California, Berkeley",
"Degree": "M.S.",
"Field": "Artificial Intelligence"
}
],
"Experience": [
{
"Company": "Google",
"Role": "Senior AI Engineer",
"Duration": "5 years"
},
{
"Company": "Facebook",
"Role": "Principal AI Engineer",
"Duration": "3 years"
}
],
"Skills": [
"Python",
"TensorFlow",
"PyTorch",
"Natural Language Processing",
"Machine Learning"
],
"Languages": [
"English",
"French",
"Spanish"
],
"Certifications": [
{
"Name": "Certified AI Professional",
"Issuing Body": "AI Professional Association"
},
{
"Name": "Advanced AI Course Certificate",
"Issuing Body": "AI Institute"
}
]
}
}
}"""
    assert extract_code_block_content(nested_fence_text) == nested_expected
@pytest.mark.parametrize("method", ['repair_json', 'get_json'])
@wrap_test_forked
def test_repair_json(method):
    """Repair every prefix of a JSON document followed by trailing prose.

    Each prefix is passed to either json_repair.repair_json or get_json (chosen by
    ``method``); the result must parse with json.loads, each call must finish within
    5 ms, and for prefixes longer than 50 characters the result must exceed 5 characters.
    """
    from json_repair import repair_json

    document = """{
"Supplementary Leverage Ratio": [7.0, 5.8, 5.7],
"Liquidity Metrics": {
"End of Period Liabilities and Equity": [2260, 2362, 2291],
"Liquidity Coverage Ratio": [118, 115, 115],
"Trading-Related Liabilities(7)": [84, 72, 72],
"Total Available Liquidty Resources": [972, 994, 961],
"Deposits Balance Sheet": [140, 166, 164],
"Other Liabilities(7)": {},
"LTD": {},
"Equity": {
"Book Value per share": [86.43, 92.16, 92.21],
"Tangible Book Value per share": [73.67, 79.07, 79.16]
}
},
"Capital and Balance Sheet ($ in B)": {
"Risk-based Capital Metrics(1)": {
"End of Period Assets": [2260, 2362, 2291],
"CET1 Capital": [147, 150, 150],
"Standardized RWAs": [1222, 1284, 1224],
"Investments, net": {},
"CET1 Capital Ratio - Standardized": [12.1, 11.7, 12.2],
"Advanced RWAs": [1255, 1265, 1212],
"Trading-Related Assets(5)": [670, 681, 659],
"CET1 Capital Ratio - Advanced": [11.7, 11.8, 12.4],
"Loans, net(6)": {},
"Other(5)": [182, 210, 206]
}
}
}
Note: Totals may not sum due to rounding. LTD: Long-term debt. All information for 4Q21 is preliminary. All footnotes are presented on Slide 26."""
    use_repair_json = method == 'repair_json'
    # Prefix lengths 0 .. len(document) - 1, i.e. the complete document itself is not tried.
    for cut in range(len(document)):
        prefix = document[:cut]
        started = time.time()
        if use_repair_json:
            repaired = repair_json(prefix)
        else:
            repaired = get_json(prefix)
        if cut > 50:
            assert len(repaired) > 5
        elapsed = time.time() - started
        assert elapsed < 0.005, "Too slow: %s" % elapsed
        print("%s : %s : %s" % (cut, elapsed, repaired))
        # Must be parseable JSON, whatever the prefix.
        json.loads(repaired)
def test_json_repair_more():
    """Repair JSON embedded in fenced LLM-style responses.

    First a ```markdown/```json nested fence is passed to json_repair.repair_json;
    then a response with prose and two fenced blocks is passed to repair_json_by_type
    and to get_json with ``json_schema_type='object'``.
    """
    import json
    from json_repair import repair_json
    from src.utils import get_json, repair_json_by_type

    nested_fence_response = """```markdown
```json
{
"Employee": {
"Name": "Henry",
"Title": "AI Scientist",
"Department": "AI",
"Location": "San Francisco",
"Contact": {
"Email": "henryai@gmail.com",
"Phone": "+1-234-567-8901"
},
"Profile": {
"Education": [
{
"Institution": "Stanford University",
"Degree": "Ph.D.",
"Field": "Computer Science"
},
{
"Institution": "University of California, Berkeley",
"Degree": "M.S.",
"Field": "Artificial Intelligence"
}
],
"Experience": [
{
"Company": "Google",
"Role": "Senior AI Engineer",
"Duration": "5 years"
},
{
"Company": "Facebook",
"Role": "Principal AI Engineer",
"Duration": "3 years"
}
],
"Skills": [
"Python",
"TensorFlow",
"PyTorch",
"Natural Language Processing",
"Machine Learning"
],
"Languages": [
"English",
"French",
"Spanish"
],
"Certifications": [
{
"Name": "Certified AI Professional",
"Issuing Body": "AI Professional Association"
},
{
"Name": "Advanced AI Course Certificate",
"Issuing Body": "AI Institute"
}
]
}
}
}
```
"""
    repaired = repair_json(nested_fence_response)
    assert repaired.startswith('{')

    prose_with_fences = """ Here is an example employee profile in JSON format, with keys that are less than 64 characters and made of only alphanumerics, underscores, or hyphens:
```json
{
"employee_id": 1234,
"name": "John Doe",
"email": "johndoe@example.com",
"job_title": "Software Engineer",
"department": "Engineering",
"hire_date": "2020-01-01",
"salary": 100000,
"manager_id": 5678
}
```
In Markdown, you can display this JSON code block like this:
```json
```
{
"employee_id": 1234,
"name": "John Doe",
"email": "johndoe@example.com",
"job_title": "Software Engineer",
"department": "Engineering",
"hire_date": "2020-01-01",
"salary": 100000,
"manager_id": 5678
}
```
This will display the JSON code block with proper formatting and highlighting.
"""
    # Both helpers must return a JSON object carrying the employee id.
    by_type = repair_json_by_type(prose_with_fences)
    assert json.loads(by_type)['employee_id'] == 1234
    print(by_type)

    as_object = get_json(prose_with_fences, json_schema_type='object')
    assert json.loads(as_object)['employee_id'] == 1234
    print(as_object)
@wrap_test_forked
def test_dedup():
    """Check that deduplicate_names suffixes repeated names with _1, _2, ... in order."""
    raw_names = ['Alice', 'Bob', 'Alice', 'Charlie', 'Bob', 'Alice']
    unique_names = ['Alice', 'Bob', 'Alice_1', 'Charlie', 'Bob_1', 'Alice_2']
    assert deduplicate_names(raw_names) == unique_names
# Test cases
def test_handle_json_normal():
    """Check that handle_json returns plain (non-schema) JSON data unchanged."""
    jobs = [
        {"company": "TechCorp", "duration": "2015-2020", "position": "Senior AI Scientist"},
        {"company": "AI Solutions", "duration": "2010-2015", "position": "AI Scientist"},
    ]
    plain = {
        "name": "Henry",
        "age": 35,
        "skills": ["AI", "Machine Learning", "Data Science"],
        "workhistory": jobs,
    }
    result = handle_json(plain)
    assert result == plain
def test_handle_json_schema():
    """Check that handle_json flattens schema-style nodes into their plain values."""

    def typed(kind, value, **extra):
        # Schema-style leaf: {"type": ..., "value": ..., <extra keys>}.
        return {"type": kind, "value": value, **extra}

    def job_schema(company, duration, position):
        # Schema-style object describing one work-history entry.
        return {
            "type": "object",
            "properties": {
                "company": typed("string", company),
                "duration": typed("string", duration),
                "position": typed("string", position),
            },
            "required": ["company", "position"],
        }

    schema_json = {
        "name": typed("string", "Henry"),
        "age": typed("integer", 35),
        "skills": {
            "type": "array",
            "items": [
                typed("string", "AI", maxLength=10),
                typed("string", "Machine Learning", maxLength=10),
                typed("string", "Data Science", maxLength=10),
            ],
            "minItems": 3,
        },
        "workhistory": {
            "type": "array",
            "items": [
                job_schema("TechCorp", "2015-2020", "Senior AI Scientist"),
                job_schema("AI Solutions", "2010-2015", "AI Scientist"),
            ],
        },
    }
    flattened = {
        "name": "Henry",
        "age": 35,
        "skills": ["AI", "Machine Learning", "Data Science"],
        "workhistory": [
            {"company": "TechCorp", "duration": "2015-2020", "position": "Senior AI Scientist"},
            {"company": "AI Solutions", "duration": "2010-2015", "position": "AI Scientist"},
        ],
    }
    assert handle_json(schema_json) == flattened
def test_handle_json_mixed():
    """Check handle_json on input that mixes plain values with schema-style nodes."""

    def typed(kind, value):
        # Schema-style leaf: {"type": ..., "value": ...}.
        return {"type": kind, "value": value}

    schema_style_job = {
        "type": "object",
        "properties": {
            "company": typed("string", "TechCorp"),
            "duration": typed("string", "2015-2020"),
            "position": typed("string", "Senior AI Scientist"),
        },
        "required": ["company", "position"],
    }
    plain_job = {"company": "AI Solutions", "duration": "2010-2015", "position": "AI Scientist"}

    mixed_json = {
        "name": "Henry",
        "age": typed("integer", 35),
        "skills": ["AI", typed("string", "Machine Learning"), "Data Science"],
        "workhistory": {"type": "array", "items": [schema_style_job, plain_job]},
    }
    flattened = {
        "name": "Henry",
        "age": 35,
        "skills": ["AI", "Machine Learning", "Data Science"],
        "workhistory": [
            {"company": "TechCorp", "duration": "2015-2020", "position": "Senior AI Scientist"},
            {"company": "AI Solutions", "duration": "2010-2015", "position": "AI Scientist"},
        ],
    }
    assert handle_json(mixed_json) == flattened
def test_handle_json_empty():
    """Check that handle_json maps an empty dict to an equal (empty) dict."""
    nothing = {}
    result = handle_json(nothing)
    assert result == nothing
def test_handle_json_no_schema():
    """A document without any schema-style nodes must come back equal to the input."""
    plain_document = {
        "name": {"first": "Henry", "last": "Smith"},
        "age": 35,
        "skills": ["AI", "Machine Learning", "Data Science"],
    }
    outcome = handle_json(plain_document)
    assert outcome == plain_document
def test_json_repair_on_string():
    """Plain prose containing no JSON: get_json gives an empty object, repair_json an empty-ish string."""
    from json_repair import repair_json

    prose = 'According to the information provided, the best safety assessment enum label is "Safe".'

    # get_json returns text that must itself parse as JSON; for an 'object' schema that is an empty dict.
    extracted = get_json(prose, json_schema_type='object')
    parsed = json.loads(extracted)
    assert isinstance(parsed, dict) and not parsed

    # repair_json alone is only expected to give back some spelling of "nothing".
    repaired = repair_json(prose)
    acceptable = ['""', """''""", '', None]
    assert isinstance(repaired, str) and repaired in acceptable
# Example usage converted to pytest test cases
def test_check_input_type():
    """Table-driven check of check_input_type for URLs, files, base64 images and junk."""
    cases = [
        # Valid URL
        ("https://example.com", 'url'),
        # Valid file path (must exist relative to the working directory for the test to pass)
        ("tests/receipt.jpg", 'file'),
        # Valid base64 encoded image
        ("b'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...", 'base64'),
        # Non-string inputs
        (b"bytes data", 'unknown'),
        (12345, 'unknown'),
        (["list", "of", "strings"], 'unknown'),
        # Invalid URL
        ("invalid://example.com", 'unknown'),
        # Invalid file path
        ("/path/to/invalid/file.txt", 'unknown'),
        # Plain string
        ("just a string", 'unknown'),
    ]
    for candidate, expected_kind in cases:
        assert check_input_type(candidate) == expected_kind
def test_process_file_list():
    """Run process_file_list on one video plus nine stills with default frame extraction."""
    video_inputs = ["tests/videotest.mp4"]
    still_inputs = [
        "tests/dental.png",
        "tests/fastfood.jpg",
        "tests/ocr2.png",
        "tests/receipt.jpg",
        "tests/revenue.png",
        "tests/jon.png",
        "tests/ocr1.png",
        "tests/ocr3.png",
        "tests/screenshot.png",
    ]
    test_files = video_inputs + still_inputs

    # Unique scratch directory per run so that outputs from different runs never mix.
    output_dir = os.path.join(tempfile.gettempdir(), f'image_path_{uuid.uuid4()}')
    print(output_dir, file=sys.stderr)

    processed_files = process_file_list(
        test_files,
        output_dir,
        resolution=(640, 480),
        image_format="jpg",
        verbose=True,
    )

    print("Processed files:")
    for produced in processed_files:
        print(produced, file=sys.stderr)
        assert os.path.isfile(produced)

    # 17 is the number of images generated from the video file.
    # NOTE(review): the additional 4 is not explained in this test - confirm where it comes from.
    expected_count = len(test_files) - 1 + 17 + 4
    assert len(processed_files) == expected_count
def test_process_file_list_extract_frames():
    """Run process_file_list on one video plus nine stills, asking for exactly 10 video frames."""
    video_inputs = ["tests/videotest.mp4"]
    still_inputs = [
        "tests/dental.png",
        "tests/fastfood.jpg",
        "tests/ocr2.png",
        "tests/receipt.jpg",
        "tests/revenue.png",
        "tests/jon.png",
        "tests/ocr1.png",
        "tests/ocr3.png",
        "tests/screenshot.png",
    ]
    test_files = video_inputs + still_inputs

    # Unique scratch directory per run so that outputs from different runs never mix.
    output_dir = os.path.join(tempfile.gettempdir(), f'image_path_{uuid.uuid4()}')
    print(output_dir, file=sys.stderr)

    processed_files = process_file_list(
        test_files,
        output_dir,
        resolution=(640, 480),
        image_format="jpg",
        video_frame_period=0,
        extract_frames=10,
        verbose=True,
    )

    print("Processed files:")
    for produced in processed_files:
        print(produced, file=sys.stderr)
        assert os.path.isfile(produced)

    # The video entry is replaced by its 10 extracted frames; every still maps to one output.
    expected_count = len(test_files) - 1 + 10
    assert len(processed_files) == expected_count
def test_process_youtube():
    """Run process_file_list on a YouTube Shorts URL plus one still, asking for 10 frames."""
    test_files = ["https://www.youtube.com/shorts/fRkZCriQQNU", "tests/screenshot.png"]

    # Unique scratch directory per run so that outputs from different runs never mix.
    output_dir = os.path.join(tempfile.gettempdir(), f'image_path_{uuid.uuid4()}')
    print(output_dir, file=sys.stderr)

    processed_files = process_file_list(
        test_files,
        output_dir,
        resolution=(640, 480),
        image_format="jpg",
        video_frame_period=0,
        extract_frames=10,
        verbose=True,
    )

    print("Processed files:")
    for produced in processed_files:
        print(produced, file=sys.stderr)
        assert os.path.isfile(produced)

    # The video URL is replaced by its 10 extracted frames; the still maps to one output.
    expected_count = len(test_files) - 1 + 10
    assert len(processed_files) == expected_count
def test_process_animated_gif():
    """Run process_file_list on an animated GIF plus one still with explicit frame settings."""
    test_files = ["tests/test_animated_gif.gif", "tests/screenshot.png"]

    # Unique scratch directory per run so that outputs from different runs never mix.
    output_dir = os.path.join(tempfile.gettempdir(), f'image_path_{uuid.uuid4()}')
    print(output_dir, file=sys.stderr)

    processed_files = process_file_list(
        test_files,
        output_dir,
        resolution=(640, 480),
        image_format="jpg",
        video_frame_period=0,
        extract_frames=10,
        verbose=True,
    )

    print("Processed files:")
    for produced in processed_files:
        print(produced, file=sys.stderr)
        assert os.path.isfile(produced)

    # 3 is the number of images generated from the animated gif.
    expected_count = len(test_files) - 1 + 3
    assert len(processed_files) == expected_count
def test_process_animated_gif2():
    """Run process_file_list on an animated GIF plus one still using only default options."""
    test_files = ["tests/test_animated_gif.gif", "tests/screenshot.png"]

    # Unique scratch directory per run so that outputs from different runs never mix.
    output_dir = os.path.join(tempfile.gettempdir(), f'image_path_{uuid.uuid4()}')
    print(output_dir, file=sys.stderr)

    processed_files = process_file_list(test_files, output_dir, verbose=True)

    print("Processed files:")
    for produced in processed_files:
        print(produced, file=sys.stderr)
        assert os.path.isfile(produced)

    # 3 is the number of images generated from the animated gif.
    expected_count = len(test_files) - 1 + 3
    assert len(processed_files) == expected_count
def test_process_animated_gif3():
    """Run process_file_list on an animated GIF plus one still with video_frame_period=1."""
    test_files = ["tests/test_animated_gif.gif", "tests/screenshot.png"]

    # Unique scratch directory per run so that outputs from different runs never mix.
    output_dir = os.path.join(tempfile.gettempdir(), f'image_path_{uuid.uuid4()}')
    print(output_dir, file=sys.stderr)

    processed_files = process_file_list(test_files, output_dir, video_frame_period=1, verbose=True)

    print("Processed files:")
    for produced in processed_files:
        print(produced, file=sys.stderr)
        assert os.path.isfile(produced)

    # 60 is the number of images generated from the animated gif.
    expected_count = len(test_files) - 1 + 60
    assert len(processed_files) == expected_count
def test_process_mixed():
    """Run process_file_list on a local video, a YouTube URL, a still and an animated GIF together."""
    test_files = [
        "tests/videotest.mp4",
        "https://www.youtube.com/shorts/fRkZCriQQNU",
        "tests/screenshot.png",
        "tests/test_animated_gif.gif",
    ]

    # Unique scratch directory per run so that outputs from different runs never mix.
    output_dir = os.path.join(tempfile.gettempdir(), f'image_path_{uuid.uuid4()}')
    print(output_dir, file=sys.stderr)

    processed_files = process_file_list(
        test_files,
        output_dir,
        resolution=(640, 480),
        image_format="jpg",
        video_frame_period=0,
        extract_frames=10,
        verbose=True,
    )

    print("Processed files:")
    for produced in processed_files:
        print(produced, file=sys.stderr)
        assert os.path.isfile(produced)

    # NOTE(review): the original comment said 28 images come from the video files, but the
    # asserted offset is 29 - confirm which number is intended.
    expected_count = len(test_files) - 1 + 29
    assert len(processed_files) == expected_count
def test_update_db():
    """Exercise fetch_user, append_to_user_data and append_to_users_data against a fresh test.db."""
    db_path = "test.db"
    remove(db_path)  # make sure no database from an earlier run is left behind

    from src.db_utils import fetch_user, append_to_user_data, append_to_users_data

    # Nothing has been stored yet, so a lookup yields an empty dict.
    assert fetch_user(db_path, '', verbose=True) == {}

    user = "jon"
    single_user_update = {
        "selection_docs_state": {
            "langchain_modes": ["NewMode1"],
            "langchain_mode_paths": {"NewMode1": "new_mode_path1"},
            "langchain_mode_types": {"NewMode1": "shared"},
        }
    }
    append_to_user_data(db_path, user, single_user_update, verbose=True)

    after_first = fetch_user(db_path, user, verbose=True)
    assert after_first == {
        'jon': {
            'selection_docs_state': {
                'langchain_mode_paths': {'NewMode1': 'new_mode_path1'},
                'langchain_mode_types': {'NewMode1': 'shared'},
                'langchain_modes': ['NewMode1'],
            }
        }
    }

    all_users_update = {
        "selection_docs_state": {
            "langchain_modes": ["NewMode"],
            "langchain_mode_paths": {"NewMode": "new_mode_path"},
            "langchain_mode_types": {"NewMode": "shared"},
        }
    }
    append_to_users_data(db_path, all_users_update, verbose=True)

    # The second update is expected to be merged into, not to replace, the first one.
    after_second = fetch_user(db_path, user, verbose=True)
    assert after_second == {
        'jon': {
            'selection_docs_state': {
                'langchain_mode_paths': {'NewMode1': 'new_mode_path1', "NewMode": "new_mode_path"},
                'langchain_mode_types': {'NewMode1': 'shared', "NewMode": "shared"},
                'langchain_modes': ['NewMode1', 'NewMode'],
            }
        }
    }
def test_encode_chat_template():
    """Round-trip a Jinja chat template through base64 and a --model_lock command-line option.

    The template is base64-encoded, embedded in a --model_lock option string,
    then extracted again and decoded; the result must equal the original template.
    """
    jinja_template = """
{{ bos_token }}
{%- if messages[0]['role'] == 'system' -%}
{% set system_message = messages[0]['content'].strip() %}
{% set loop_messages = messages[1:] %}
{%- else -%}
{% set system_message = 'This is a chat between a user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\'s questions based on the context. The assistant should also indicate when the answer cannot be found in the context.' %}
{% set loop_messages = messages %}
{%- endif -%}
System: {{ system_message }}
{% for message in loop_messages %}
{%- if message['role'] == 'user' -%}
User: {{ message['content'].strip() + '\n' }}
{%- else -%}
Assistant: {{ message['content'].strip() + '\n' }}
{%- endif %}
{% if loop.last and message['role'] == 'user' %}
Assistant:
{% endif %}
{% endfor %}
"""
    encoded_template = base64_encode_jinja_template(jinja_template)
    print("\nEncoded Template:", encoded_template)
    model_lock_option = f"""--model_lock="[{{'inference_server': 'vllm_chat:149.130.210.116', 'base_model': 'nvidia/Llama3-ChatQA-1.5-70B', 'visible_models': 'nvidia/Llama3-ChatQA-1.5-70B', 'h2ogpt_key': '62224bfb-c832-4452-81e7-8a4bdabbe164', 'chat_template': '{encoded_template}'}}]"
"""
    print("Command-Line Option:")
    print(model_lock_option)
    # Example of decoding back from the command-line option.
    # str.strip('--model_lock=') would strip a *set of characters* from both ends rather than
    # the literal prefix, so drop surrounding whitespace and cut the prefix off explicitly.
    option_prefix = '--model_lock='
    command_line_option = model_lock_option.strip()
    assert command_line_option.startswith(option_prefix)
    command_line_option = command_line_option[len(option_prefix):]
    # double ast.literal_eval due to quoted quote for model_lock_option
    parsed_model_lock_option = ast.literal_eval(ast.literal_eval(command_line_option))
    encoded_template_from_option = parsed_model_lock_option[0]['chat_template']
    decoded_template = base64_decode_jinja_template(encoded_template_from_option)
    print("Decoded Template:")
    print(decoded_template)
    assert jinja_template == decoded_template
def test_depth():
    """Check get_gradio_depth on chat-history-shaped lists and on plain nested lists."""
    long_reply = "I apologize for the confusion earlier!\n\nI am LLaMA, an AI assistant developed by Meta AI that can understand and respond to human input in a conversational manner. I'm not a human, but a computer program designed to simulate conversation, answer questions, and even generate text based on the input I receive.\n\nI can assist with a wide range of topics, from general knowledge to entertainment, and even create stories or dialogues. I'm constantly learning and improving my responses based on the interactions I have with users like you.\n\nSo, feel free to ask me anything, and I'll do my best to help!"
    dog_image = '/tmp/gradio/image_Dog_d2b19221_6f70_4987_bda8_09be952eae93.png'
    who_image = '/tmp/gradio/image_Wh_480bd8318d01b570b61e77a9306aef87_c41f.png'

    def empty_slots():
        # Eleven fresh empty lists, matching the trailing entries of each chat example.
        return [[] for _ in range(11)]

    # Chat turns whose answers are lists of image paths.
    image_chat = [
        ['Dog', [dog_image]],
        ['Who are you?', [who_image]],
        ['Who ar eyou?', long_reply],
    ]
    assert get_gradio_depth([image_chat] + empty_slots()) == 3

    # Same, but the first question is itself wrapped in a list.
    wrapped_question_chat = [
        [['Dog'], [dog_image]],
        ['Who are you?', [who_image]],
        ['Who ar eyou?', long_reply],
    ]
    assert get_gradio_depth([wrapped_question_chat] + empty_slots()) == 3

    # Text-only chat turns.
    text_chat = [['Dog', "Bad Dog"], ['Who are you?', "Image"], ['Who ar eyou?', long_reply]]
    assert get_gradio_depth([text_chat] + empty_slots()) == 3

    # The whole structure wrapped in one more list.
    assert get_gradio_depth([[text_chat] + empty_slots()]) == 4

    # Bare history and synthetic nesting cases, checked in the same order as before.
    nested_cases = [
        ([['Dog', "Bad Dog"], ['Who are you?', "Image"]], 2),
        # more cases
        ([], 0),
        ([1, 2, 3], 1),
        ([[1], [2], [3]], 1),
        ([[[1]], [[2]], [[3]]], 2),
        ([[[[1]]], [[[2]]], [[[3]]]], 3),
        ([[[[[1]]]], [[[[2]]]], [[[[3]]]]], 4),
        ([[], [1], [2, [3]], [[[4]]]], 3),
        ([[], [[[[1]]]], [2, [3]], [[[4]]]], 4),
        ([[], [[[[[1]]]]], [2, [3]], [[[4]]]], 5),
        ([[[[[1]]]], [[[[2]]]], [[[3]]], [[4]], [5]], 4),
        ([[[[[1]]]], [[[[2]]]], [[[3]]], [[4]], [5], []], 4),
    ]
    for example_list, expected_depth in nested_cases:
        assert get_gradio_depth(example_list) == expected_depth
def test_schema_to_typed():
    """Smoke test: create_typed_dict accepts a nested JSON schema and the result works as an annotation."""
    job_schema = {
        "type": "object",
        "properties": {
            "company": {"type": "string"},
            "duration": {"type": "string"},
            "position": {"type": "string"},
        },
        "required": ["company", "position"],
    }
    skills_schema = {
        "type": "array",
        "items": {"type": "string", "maxLength": 10},
        "minItems": 3,
    }
    TEST_SCHEMA = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "age": {"type": "integer"},
            "skills": skills_schema,
            "work history": {"type": "array", "items": job_schema},
        },
        "required": ["name", "age", "skills", "work history"],
    }

    Schema = create_typed_dict(TEST_SCHEMA)

    # Example usage of the generated type; the second job omits the optional "duration".
    jobs = [
        {"company": "TechCorp", "position": "Developer", "duration": "2 years"},
        {"company": "DataInc", "position": "Data Scientist"},
    ]
    person: Schema = {
        "name": "John Doe",
        "age": 30,
        "skills": ["Python", "TypeScript", "Docker"],
        "work history": jobs,
    }
    print(person)
def test_genai_schema():
    """Smoke test: convert_to_genai_schema accepts a nested object schema that includes an enum."""
    from src.utils_langchain import convert_to_genai_schema

    job_schema = {
        "type": "object",
        "properties": {
            "company": {"type": "string"},
            "duration": {"type": "string"},
            "position": {"type": "string"},
        },
        "required": ["company", "position"],
    }
    properties = {
        "name": {"type": "string"},
        "age": {"type": "integer"},
        "skills": {
            "type": "array",
            "items": {"type": "string", "maxLength": 10},
            "minItems": 3,
        },
        "work history": {"type": "array", "items": job_schema},
        "status": {
            "type": "string",
            "enum": ["active", "inactive", "on leave"],
        },
    }
    TEST_SCHEMA = {
        "type": "object",
        "properties": properties,
        "required": ["name", "age", "skills", "work history", "status"],
    }

    genai_schema = convert_to_genai_schema(TEST_SCHEMA)
    # Printing shows the structure, though not necessarily every detail.
    print(genai_schema)
    # The converted schema is meant for use with the Gemini API, for example:
    # response = model.generate_content(prompt, response_schema=genai_schema)
def test_genai_schema_more():
    """Smoke test: convert_to_genai_schema handles object, array, string, number and boolean roots."""
    from src.utils_langchain import convert_to_genai_schema

    address_schema = {
        "type": "object",
        "properties": {
            "street": {"type": "string"},
            "city": {"type": "string"},
            "country": {"type": "string"},
        },
        "required": ["street", "city"],
        "description": "Address details",
    }
    # Object schema
    person_schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string", "description": "The person's name"},
            "age": {"type": "integer", "description": "The person's age"},
            "height": {"type": "number", "format": "float", "description": "Height in meters"},
            "is_student": {"type": "boolean", "description": "Whether the person is a student"},
            "skills": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of skills",
            },
            "address": address_schema,
            "status": {
                "type": "string",
                "enum": ["active", "inactive", "on leave"],
                "description": "Current status",
            },
        },
        "required": ["name", "age", "is_student"],
        "description": "A person's profile",
    }
    # Array schema
    item_list_schema = {
        "type": "array",
        "items": {
            "type": "object",
            "properties": {
                "id": {"type": "integer"},
                "name": {"type": "string"},
            },
            "required": ["id"],
        },
        "description": "List of items",
    }
    # String schema
    email_schema = {"type": "string", "format": "email", "description": "Email address"}
    # Number schema
    double_schema = {"type": "number", "format": "double", "description": "A floating-point number"}
    # Boolean schema
    flag_schema = {"type": "boolean", "description": "A true/false value"}

    TEST_SCHEMAS = [person_schema, item_list_schema, email_schema, double_schema, flag_schema]

    # Convert each schema in turn; numbering in the output starts at 1.
    for position, candidate in enumerate(TEST_SCHEMAS, 1):
        print(f"\nTest Schema {position}:")
        converted = convert_to_genai_schema(candidate)
        print(converted)
def test_pymupdf4llm():
    """Compare per-page load times of PyMuPDFLoader and PyMuPDF4LLMLoader on a local PDF.

    The benchmark PDF lives at a machine-specific path, so the test is skipped
    when that file is not present instead of failing with FileNotFoundError.
    """
    from langchain_community.document_loaders import PyMuPDFLoader
    from src.utils_langchain import PyMuPDF4LLMLoader

    # The original also listed 'tests' and '/home/jon/Downloads/' but immediately overwrote
    # the result with this single path; only the effective value is kept.
    files = ['/home/jon/Downloads/Tabasco_Ingredients_Products_Guide.pdf']
    files = [x for x in files if x.endswith('.pdf') and os.path.isfile(x)]
    if not files:
        pytest.skip("benchmark PDF not available on this machine")

    times_pymupdf = []
    times_pymupdf4llm = []
    for file in files:
        t0 = time.time()
        doc = PyMuPDFLoader(file).load()
        assert doc is not None
        elapsed = time.time() - t0
        print('pymupdf: %s %s %s' % (file, len(doc), elapsed))
        # max(..., 1) avoids ZeroDivisionError should a loader return no pages
        times_pymupdf.append(elapsed / max(len(doc), 1))
        for page in doc:
            print(page)

        t0 = time.time()
        doc = PyMuPDF4LLMLoader(file).load()
        assert doc is not None
        elapsed = time.time() - t0
        print('pymupdf4llm: %s %s %s' % (file, len(doc), elapsed))
        times_pymupdf4llm.append(elapsed / max(len(doc), 1))
        for page in doc:
            print(page)

        # Cap the benchmark so that a large file list does not run unbounded.
        if len(times_pymupdf) > 30:
            break

    print("pymupdf stats:")
    compute_stats(times_pymupdf)
    print("pymupdf4llm stats:")
    compute_stats(times_pymupdf4llm)
def compute_stats(times_in_seconds):
    """Print the minimum, maximum and mean of a collection of timings.

    :param times_in_seconds: iterable of durations in seconds; may be empty
        and may be a one-shot iterator.
    :return: None; the statistics are written to stdout.
    """
    # Materialise once so that generators survive the three passes below.
    samples = list(times_in_seconds)
    if not samples:
        # min()/max() raise ValueError on empty input and the mean would divide by zero.
        print("No timings collected")
        return

    # Compute statistics
    min_time = min(samples)
    max_time = max(samples)
    average_time = sum(samples) / len(samples)

    # Print the results
    print(f"Min time: {min_time} seconds")
    print(f"Max time: {max_time} seconds")
    print(f"Average time: {average_time} seconds")