{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import sys\n", "import torch\n", "import json\n", "from chemietoolkit import ChemIEToolkit\n", "import cv2\n", "from PIL import Image\n", "import json\n", "model = ChemIEToolkit(device=torch.device('cpu')) \n", "from get_molecular_agent import process_reaction_image_with_multiple_products_and_text\n", "from get_reaction_agent import get_reaction_withatoms\n", "from get_reaction_agent import get_full_reaction\n", "\n", "\n", "# 定义函数,接受多个图像路径并返回反应列表\n", "def get_multi_molecular(image_path: str) -> list:\n", " '''Returns a list of reactions extracted from the image.'''\n", " # 打开图像文件\n", " image = Image.open(image_path).convert('RGB')\n", " \n", " # 将图像作为输入传递给模型\n", " coref_results = model.extract_molecule_corefs_from_figures([image])\n", " \n", " for item in coref_results:\n", " for bbox in item.get(\"bboxes\", []):\n", " for key in [\"category\", \"molfile\", \"symbols\", 'atoms', \"bonds\", 'category_id', 'score', 'corefs',\"coords\",\"edges\"]: #'atoms'\n", " bbox.pop(key, None) # 安全地移除键\n", " print(json.dumps(coref_results))\n", " # 返回反应列表,使用 json.dumps 进行格式化\n", " \n", " return json.dumps(coref_results)\n", "\n", "def get_multi_molecular_text_to_correct(image_path: str) -> list:\n", " '''Returns a list of reactions extracted from the image.'''\n", " # 打开图像文件\n", " image = Image.open(image_path).convert('RGB')\n", " \n", " # 将图像作为输入传递给模型\n", " coref_results = model.extract_molecule_corefs_from_figures([image])\n", " #coref_results = process_reaction_image_with_multiple_products_and_text(image_path)\n", " for item in coref_results:\n", " for bbox in item.get(\"bboxes\", []):\n", " for key in [\"category\", \"bbox\", \"molfile\", \"symbols\", 'atoms', \"bonds\", 'category_id', 'score', 'corefs',\"coords\",\"edges\"]: #'atoms'\n", " bbox.pop(key, None) # 安全地移除键\n", " print(json.dumps(coref_results))\n", " # 返回反应列表,使用 json.dumps 进行格式化\n", " \n", " return json.dumps(coref_results)\n", "\n", "def get_multi_molecular_text_to_correct_withatoms(image_path: str) -> list:\n", " '''Returns a list of reactions extracted from the image.'''\n", " # 打开图像文件\n", " image = Image.open(image_path).convert('RGB')\n", " \n", " # 将图像作为输入传递给模型\n", " #coref_results = model.extract_molecule_corefs_from_figures([image])\n", " coref_results = process_reaction_image_with_multiple_products_and_text(image_path)\n", " for item in coref_results:\n", " for bbox in item.get(\"bboxes\", []):\n", " for key in [\"molfile\", 'atoms', \"bonds\", 'category_id', 'score', 'corefs',\"coords\",\"edges\"]: #'atoms'\n", " bbox.pop(key, None) # 安全地移除键\n", " print(json.dumps(coref_results))\n", " # 返回反应列表,使用 json.dumps 进行格式化\n", " return json.dumps(coref_results)\n", "\n", "#get_multi_molecular_text_to_correct('./acs.joc.2c00176 example 1.png')\n", "\n", "import sys\n", "#sys.path.append('./RxnScribe-main/')\n", "import torch\n", "from rxnscribe import RxnScribe\n", "import json\n", "\n", "ckpt_path = \"./pix2seq_reaction_full.ckpt\"\n", "model1 = RxnScribe(ckpt_path, device=torch.device('cpu'))\n", "device = torch.device('cpu')\n", "\n", "def get_reaction(image_path: str) -> dict:\n", " '''\n", " Returns a structured dictionary of reactions extracted from the image,\n", " including reactants, conditions, and products, with their smiles, text, and bbox.\n", " '''\n", " image_file = image_path\n", " #raw_prediction = model1.predict_image_file(image_file, molscribe=True, ocr=True)\n", " raw_prediction = get_reaction_withatoms(image_path)\n", "\n", " # Ensure raw_prediction is treated as a list directly\n", " structured_output = {}\n", " for section_key in ['reactants', 'conditions', 'products']:\n", " if section_key in raw_prediction[0]:\n", " structured_output[section_key] = []\n", " for item in raw_prediction[0][section_key]:\n", " if section_key in ['reactants', 'products']:\n", " # Extract smiles and bbox for molecules\n", " structured_output[section_key].append({\n", " \"smiles\": item.get(\"smiles\", \"\"),\n", " \"bbox\": item.get(\"bbox\", [])\n", " })\n", " elif section_key == 'conditions':\n", " # Extract smiles, text, and bbox for conditions\n", " condition_data = {\"bbox\": item.get(\"bbox\", [])}\n", " if \"smiles\" in item:\n", " condition_data[\"smiles\"] = item.get(\"smiles\", \"\")\n", " if \"text\" in item:\n", " condition_data[\"text\"] = item.get(\"text\", [])\n", " structured_output[section_key].append(condition_data)\n", " print(f\"structured_output:{structured_output}\")\n", "\n", " return structured_output\n", "\n", "\n", "\n", "\n", "import base64\n", "import torch\n", "import json\n", "from PIL import Image\n", "import numpy as np\n", "from chemietoolkit import ChemIEToolkit, utils\n", "from openai import AzureOpenAI\n", "\n", "def process_reaction_image_with_multiple_products(image_path: str) -> dict:\n", " \"\"\"\n", " Args:\n", " image_path (str): 图像文件路径。\n", "\n", " Returns:\n", " dict: 整理后的反应数据,包括反应物、产物和反应模板。\n", " \"\"\"\n", " # 配置 API Key 和 Azure Endpoint\n", " api_key = \"b038da96509b4009be931e035435e022\" # 替换为实际的 API Key\n", " azure_endpoint = \"https://hkust.azure-api.net\" # 替换为实际的 Azure Endpoint\n", " \n", "\n", " model = ChemIEToolkit(device=torch.device('cpu'))\n", " client = AzureOpenAI(\n", " api_key=api_key,\n", " api_version='2024-06-01',\n", " azure_endpoint=azure_endpoint\n", " )\n", "\n", " # 加载图像并编码为 Base64\n", " def encode_image(image_path: str):\n", " with open(image_path, \"rb\") as image_file:\n", " return base64.b64encode(image_file.read()).decode('utf-8')\n", "\n", " base64_image = encode_image(image_path)\n", "\n", " # GPT 工具调用配置\n", " tools = [\n", " {\n", " 'type': 'function',\n", " 'function': {\n", " 'name': 'get_multi_molecular_text_to_correct',\n", " 'description': 'Extracts the SMILES string and text coref from molecular images.',\n", " 'parameters': {\n", " 'type': 'object',\n", " 'properties': {\n", " 'image_path': {\n", " 'type': 'string',\n", " 'description': 'Path to the reaction image.'\n", " }\n", " },\n", " 'required': ['image_path'],\n", " 'additionalProperties': False\n", " }\n", " }\n", " },\n", " {\n", " 'type': 'function',\n", " 'function': {\n", " 'name': 'get_reaction',\n", " 'description': 'Get a list of reactions from a reaction image. A reaction contains data of the reactants, conditions, and products.',\n", " 'parameters': {\n", " 'type': 'object',\n", " 'properties': {\n", " 'image_path': {\n", " 'type': 'string',\n", " 'description': 'The path to the reaction image.',\n", " },\n", " },\n", " 'required': ['image_path'],\n", " 'additionalProperties': False,\n", " },\n", " },\n", " },\n", " ]\n", "\n", " # 提供给 GPT 的消息内容\n", " with open('./prompt.txt', 'r') as prompt_file:\n", " prompt = prompt_file.read()\n", " messages = [\n", " {'role': 'system', 'content': 'You are a helpful assistant.'},\n", " {\n", " 'role': 'user',\n", " 'content': [\n", " {'type': 'text', 'text': prompt},\n", " {'type': 'image_url', 'image_url': {'url': f'data:image/png;base64,{base64_image}'}}\n", " ]\n", " }\n", " ]\n", "\n", " # 调用 GPT 接口\n", " response = client.chat.completions.create(\n", " model = 'gpt-4o',\n", " temperature = 0,\n", " response_format={ 'type': 'json_object' },\n", " messages = [\n", " {'role': 'system', 'content': 'You are a helpful assistant.'},\n", " {\n", " 'role': 'user',\n", " 'content': [\n", " {\n", " 'type': 'text',\n", " 'text': prompt\n", " },\n", " {\n", " 'type': 'image_url',\n", " 'image_url': {\n", " 'url': f'data:image/png;base64,{base64_image}'\n", " }\n", " }\n", " ]},\n", " ],\n", " tools = tools)\n", " \n", "# Step 1: 工具映射表\n", " TOOL_MAP = {\n", " 'get_multi_molecular_text_to_correct': get_multi_molecular_text_to_correct,\n", " 'get_reaction': get_reaction\n", " }\n", "\n", " # Step 2: 处理多个工具调用\n", " tool_calls = response.choices[0].message.tool_calls\n", " results = []\n", "\n", " # 遍历每个工具调用\n", " for tool_call in tool_calls:\n", " tool_name = tool_call.function.name\n", " tool_arguments = tool_call.function.arguments\n", " tool_call_id = tool_call.id\n", " \n", " tool_args = json.loads(tool_arguments)\n", " \n", " if tool_name in TOOL_MAP:\n", " # 调用工具并获取结果\n", " tool_result = TOOL_MAP[tool_name](image_path)\n", " else:\n", " raise ValueError(f\"Unknown tool called: {tool_name}\")\n", " \n", " # 保存每个工具调用结果\n", " results.append({\n", " 'role': 'tool',\n", " 'content': json.dumps({\n", " 'image_path': image_path,\n", " f'{tool_name}':(tool_result),\n", " }),\n", " 'tool_call_id': tool_call_id,\n", " })\n", "\n", "\n", "# Prepare the chat completion payload\n", " completion_payload = {\n", " 'model': 'gpt-4o',\n", " 'messages': [\n", " {'role': 'system', 'content': 'You are a helpful assistant.'},\n", " {\n", " 'role': 'user',\n", " 'content': [\n", " {\n", " 'type': 'text',\n", " 'text': prompt\n", " },\n", " {\n", " 'type': 'image_url',\n", " 'image_url': {\n", " 'url': f'data:image/png;base64,{base64_image}'\n", " }\n", " }\n", " ]\n", " },\n", " response.choices[0].message,\n", " *results\n", " ],\n", " }\n", "\n", "# Generate new response\n", " response = client.chat.completions.create(\n", " model=completion_payload[\"model\"],\n", " messages=completion_payload[\"messages\"],\n", " response_format={ 'type': 'json_object' },\n", " temperature=0\n", " )\n", "\n", "\n", " \n", " # 获取 GPT 生成的结果\n", " gpt_output = json.loads(response.choices[0].message.content)\n", " print(f\"gptout:{gpt_output}\")\n", "\n", " image = Image.open(image_path).convert('RGB')\n", " image_np = np.array(image)\n", "\n", " #########################\n", " #reaction_results = model.extract_reactions_from_figures([image_np])\n", " reaction_results = get_reaction_withatoms(image_path)[0]\n", " reactions = []\n", " \n", " # 将 reactants 和 products 转换为 reactions\n", " for reactants, conditions, products in zip(reaction_results.get('reactants', []), reaction_results.get('conditions', []), reaction_results.get('products', [])):\n", " reaction = {\n", " \"reactants\": [reactants],\n", " \"conditions\": [conditions],\n", " \"products\": [products]\n", " }\n", " reactions.append(reaction)\n", " reaction_results = [{\"reactions\": reactions}]\n", " #coref_results = model.extract_molecule_corefs_from_figures([image_np])\n", " coref_results = process_reaction_image_with_multiple_products_and_text(image_path)\n", " ########################\n", "\n", " # 定义更新工具输出的函数\n", " def extract_smiles_details(smiles_data, raw_details):\n", " smiles_details = {}\n", " for smiles in smiles_data:\n", " for detail in raw_details:\n", " for bbox in detail.get('bboxes', []):\n", " if bbox.get('smiles') == smiles:\n", " smiles_details[smiles] = {\n", " 'category': bbox.get('category'),\n", " 'bbox': bbox.get('bbox'),\n", " 'category_id': bbox.get('category_id'),\n", " 'score': bbox.get('score'),\n", " 'molfile': bbox.get('molfile'),\n", " 'atoms': bbox.get('atoms'),\n", " 'bonds': bbox.get('bonds')\n", " }\n", " break\n", " return smiles_details\n", "\n", "# 获取结果\n", " smiles_details = extract_smiles_details(gpt_output, coref_results)\n", "\n", " reactants_array = []\n", " products = []\n", "\n", " for reactant in reaction_results[0]['reactions'][0]['reactants']:\n", " #for reactant in reaction_results[0]['reactions'][0]['reactants']:\n", " if 'smiles' in reactant:\n", " #print(reactant['smiles'])\n", " #print(reactant)\n", " reactants_array.append(reactant['smiles'])\n", "\n", " for product in reaction_results[0]['reactions'][0]['products']:\n", " #print(product['smiles'])\n", " #print(product)\n", " products.append(product['smiles'])\n", " # 输出结果\n", " #import pprint\n", " #pprint.pprint(smiles_details)\n", "\n", " # 整理反应数据\n", " try:\n", " backed_out = utils.backout_without_coref(reaction_results, coref_results, gpt_output, smiles_details, model.molscribe)\n", " backed_out.sort(key=lambda x: x[2])\n", " extracted_rxns = {}\n", " for reactants, products_, label in backed_out:\n", " extracted_rxns[label] = {'reactants': reactants, 'products': products_}\n", "\n", " toadd = {\n", " \"reaction_template\": {\n", " \"reactants\": reactants_array,\n", " \"products\": products\n", " },\n", " \"reactions\": extracted_rxns\n", " }\n", " \n", "\n", " # 按标签排序\n", " sorted_keys = sorted(toadd[\"reactions\"].keys())\n", " toadd[\"reactions\"] = {i: toadd[\"reactions\"][i] for i in sorted_keys}\n", " original_molecular_list = {'Original molecular list': gpt_output}\n", " final_data= toadd.copy()\n", " final_data.update(original_molecular_list)\n", " except:\n", " #pass\n", " final_data = {'Original molecular list': gpt_output}\n", "\n", " print(final_data)\n", " return final_data\n", " \n", "\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# # image_path = './example/Replace/99.jpg'\n", "# # result = process_reaction_image(image_path)\n", "# # print(json.dumps(result, indent=4))\n", "# image_path = './example/example1/replace/Nesting/283.jpg'\n", "# image = Image.open(image_path).convert('RGB')\n", "# image_np = np.array(image)\n", "\n", "# # input1 = get_multi_molecular_text_to_correct_withatoms('./example/example1/replace/Nesting/283.jpg')\n", "# # input2 = get_reaction('./example/example1/replace/Nesting/283.jpg')\n", "# # print(input1)\n", "# # print(input2)\n", "# #reaction_results = model.extract_reactions_from_figures([image_np])\n", "# coorf = model.extract_molecule_corefs_from_figures([image_np])\n", "# print(coorf)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import base64\n", "import torch\n", "import json\n", "from PIL import Image\n", "import numpy as np\n", "from openai import AzureOpenAI\n", "\n", "def process_reaction_image_final(image_path: str) -> dict:\n", " \"\"\"\n", "\n", " Args:\n", " image_path (str): 图像文件路径。\n", "\n", " Returns:\n", " dict: 整理后的反应数据,包括反应物、产物和反应模板。\n", " \"\"\"\n", " # 配置 API Key 和 Azure Endpoint\n", " api_key = \"b038da96509b4009be931e035435e022\" # 替换为实际的 API Key\n", " azure_endpoint = \"https://hkust.azure-api.net\" # 替换为实际的 Azure Endpoint\n", " \n", "\n", " model = ChemIEToolkit(device=torch.device('cpu'))\n", " client = AzureOpenAI(\n", " api_key=api_key,\n", " api_version='2024-06-01',\n", " azure_endpoint=azure_endpoint\n", " )\n", "\n", " # 加载图像并编码为 Base64\n", " def encode_image(image_path: str):\n", " with open(image_path, \"rb\") as image_file:\n", " return base64.b64encode(image_file.read()).decode('utf-8')\n", "\n", " base64_image = encode_image(image_path)\n", "\n", " # GPT 工具调用配置\n", " tools = [\n", " {\n", " 'type': 'function',\n", " 'function': {\n", " 'name': 'get_multi_molecular_text_to_correct',\n", " 'description': 'Extracts the SMILES string and text coref from molecular sub-images from a reaction image and ready for further process.',\n", " 'parameters': {\n", " 'type': 'object',\n", " 'properties': {\n", " 'image_path': {\n", " 'type': 'string',\n", " 'description': 'Path to the reaction image.'\n", " }\n", " },\n", " 'required': ['image_path'],\n", " 'additionalProperties': False\n", " }\n", " }\n", " },\n", " {\n", " 'type': 'function',\n", " 'function': {\n", " 'name': 'get_reaction',\n", " 'description': 'Get a list of reactions from a reaction image. A reaction contains data of the reactants, conditions, and products.',\n", " 'parameters': {\n", " 'type': 'object',\n", " 'properties': {\n", " 'image_path': {\n", " 'type': 'string',\n", " 'description': 'The path to the reaction image.',\n", " },\n", " },\n", " 'required': ['image_path'],\n", " 'additionalProperties': False,\n", " },\n", " },\n", " },\n", "\n", " \n", "\n", " {\n", " 'type': 'function',\n", " 'function': {\n", " 'name': 'process_reaction_image_with_multiple_products',\n", " 'description': 'process the reaction image that contains a multiple products table. Get a list of reactions from the reaction image, Inculding the reaction template and detailed reaction with detailed R-group information.',\n", " 'parameters': {\n", " 'type': 'object',\n", " 'properties': {\n", " 'image_path': {\n", " 'type': 'string',\n", " 'description': 'The path to the reaction image.',\n", " },\n", " },\n", " 'required': ['image_path'],\n", " 'additionalProperties': False,\n", " },\n", " },\n", " },\n", "\n", " {\n", " 'type': 'function',\n", " 'function': {\n", " 'name': 'get_full_reaction',\n", " 'description': 'Get a list of reactions from a reaction image without any tables. A reaction contains data of the reactants, conditions, and products.',\n", " 'parameters': {\n", " 'type': 'object',\n", " 'properties': {\n", " 'image_path': {\n", " 'type': 'string',\n", " 'description': 'The path to the reaction image.',\n", " },\n", " },\n", " 'required': ['image_path'],\n", " 'additionalProperties': False,\n", " },\n", " },\n", " },\n", "\n", " {\n", " 'type': 'function',\n", " 'function': {\n", " 'name': 'get_multi_molecular',\n", " 'description': 'Extracts the SMILES string and text coref from a molecular image without any reactions',\n", " 'parameters': {\n", " 'type': 'object',\n", " 'properties': {\n", " 'image_path': {\n", " 'type': 'string',\n", " 'description': 'The path to the reaction image.',\n", " },\n", " },\n", " 'required': ['image_path'],\n", " 'additionalProperties': False,\n", " },\n", " },\n", " },\n", " ]\n", "\n", " # 提供给 GPT 的消息内容\n", " with open('./prompt_final.txt', 'r') as prompt_file:\n", " prompt = prompt_file.read()\n", " messages = [\n", " {'role': 'system', 'content': 'You are a helpful assistant.'},\n", " {\n", " 'role': 'user',\n", " 'content': [\n", " {'type': 'text', 'text': prompt},\n", " {'type': 'image_url', 'image_url': {'url': f'data:image/png;base64,{base64_image}'}}\n", " ]\n", " }\n", " ]\n", "\n", " # 调用 GPT 接口\n", " response = client.chat.completions.create(\n", " model = 'gpt-4o',\n", " temperature = 0,\n", " response_format={ 'type': 'json_object' },\n", " messages = [\n", " {'role': 'system', 'content': 'You are a helpful assistant.'},\n", " {\n", " 'role': 'user',\n", " 'content': [\n", " {\n", " 'type': 'text',\n", " 'text': prompt\n", " },\n", " {\n", " 'type': 'image_url',\n", " 'image_url': {\n", " 'url': f'data:image/png;base64,{base64_image}'\n", " }\n", " }\n", " ]},\n", " ],\n", " tools = tools)\n", " \n", "# Step 1: 工具映射表\n", " TOOL_MAP = {\n", " 'get_multi_molecular_text_to_correct': get_multi_molecular_text_to_correct,\n", " 'get_reaction': get_reaction,\n", " 'process_reaction_image_with_multiple_products':process_reaction_image_with_multiple_products,\n", "\n", " 'get_full_reaction': get_full_reaction,\n", " 'get_multi_molecular':get_multi_molecular,\n", " }\n", "\n", " # Step 2: 处理多个工具调用\n", " tool_calls = response.choices[0].message.tool_calls\n", " results = []\n", "\n", " # 遍历每个工具调用\n", " for tool_call in tool_calls:\n", " tool_name = tool_call.function.name\n", " tool_arguments = tool_call.function.arguments\n", " tool_call_id = tool_call.id\n", " \n", " tool_args = json.loads(tool_arguments)\n", " \n", " if tool_name in TOOL_MAP:\n", " # 调用工具并获取结果\n", " tool_result = TOOL_MAP[tool_name](image_path)\n", " else:\n", " raise ValueError(f\"Unknown tool called: {tool_name}\")\n", " \n", " # 保存每个工具调用结果\n", " results.append({\n", " 'role': 'tool',\n", " 'content': json.dumps({\n", " 'image_path': image_path,\n", " f'{tool_name}':(tool_result),\n", " }),\n", " 'tool_call_id': tool_call_id,\n", " })\n", "\n", "\n", "# Prepare the chat completion payload\n", " completion_payload = {\n", " 'model': 'gpt-4o',\n", " 'messages': [\n", " {'role': 'system', 'content': 'You are a helpful assistant.'},\n", " {\n", " 'role': 'user',\n", " 'content': [\n", " {\n", " 'type': 'text',\n", " 'text': prompt\n", " },\n", " {\n", " 'type': 'image_url',\n", " 'image_url': {\n", " 'url': f'data:image/png;base64,{base64_image}'\n", " }\n", " }\n", " ]\n", " },\n", " response.choices[0].message,\n", " *results\n", " ],\n", " }\n", "\n", "# Generate new response\n", " response = client.chat.completions.create(\n", " model=completion_payload[\"model\"],\n", " messages=completion_payload[\"messages\"],\n", " response_format={ 'type': 'json_object' },\n", " temperature=0\n", " )\n", "\n", "\n", " \n", " # 获取 GPT 生成的结果\n", " gpt_output = json.loads(response.choices[0].message.content)\n", " print(gpt_output)\n", " return gpt_output\n", "\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "image_path = './data/bowen-4/2.png'\n", "result = process_reaction_image_final(image_path)\n", "print(json.dumps(result, indent=4))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# def get_reaction(image_path: str) -> list:\n", "# '''Returns a list of reactions extracted from the image.'''\n", "# image_file = image_path\n", "# return json.dumps(model1.predict_image_file(image_file, molscribe=True, ocr=True))\n", "\n", "# reaction_output = get_reaction('./pdf/2/2_image_3_1.png')\n", "# print(reaction_output)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "import fitz # PyMuPDF\n", "from core import run_visualheist\n", "import base64\n", "from openai import AzureOpenAI\n", "\n", "def full_pdf_extraction_pipeline_with_history(pdf_path,\n", " output_dir,\n", " api_key,\n", " azure_endpoint,\n", " model=\"gpt-4o\",\n", " model_size=\"large\"):\n", " \"\"\"\n", " Full pipeline: from PDF to GPT-annotated related text.\n", " Extracts markdown + figures + reaction data from a PDF and calls GPT-4o to annotate them.\n", "\n", " Args:\n", " pdf_path (str): Path to input PDF file.\n", " output_dir (str): Directory to save results.\n", " api_key (str): Azure OpenAI API key.\n", " azure_endpoint (str): Azure OpenAI endpoint.\n", " model (str): GPT model name (default \"gpt-4o\").\n", " model_size (str): VisualHeist model size (\"base\", \"large\", etc).\n", "\n", " Returns:\n", " List of GPT-generated annotated related-text JSONs.\n", " \"\"\"\n", "\n", "\n", " os.makedirs(output_dir, exist_ok=True)\n", "\n", " # Step 1: Extract Markdown text\n", " doc = fitz.open(pdf_path)\n", " md_text = \"\"\n", " for i, page in enumerate(doc, start=1):\n", " md_text += f\"\\n\\n## = Page {i} =\\n\\n\" + page.get_text()\n", " filename = os.path.splitext(os.path.basename(pdf_path))[0]\n", " md_path = os.path.join(output_dir, f\"{filename}.md\")\n", " with open(md_path, \"w\", encoding=\"utf-8\") as f:\n", " f.write(md_text.strip())\n", " print(f\"[✓] Markdown saved to: {md_path}\")\n", "\n", " # Step 2: Extract figures using VisualHeist\n", " run_visualheist(pdf_dir=pdf_path, model_size=model_size, image_dir=output_dir)\n", " print(f\"[✓] Figures extracted to: {output_dir}\")\n", "\n", " # Step 3: Parse figures to JSON\n", " image_data = []\n", " known_molecules = []\n", "\n", " for fname in sorted(os.listdir(output_dir)):\n", " if fname.endswith(\".png\"):\n", " img_path = os.path.join(output_dir, fname)\n", " try:\n", " result = process_reaction_image_final(img_path)\n", " result[\"image_name\"] = fname\n", " image_data.append(result)\n", " except Exception as e:\n", " print(f\"[!] Failed on {fname}: {e}\")\n", " new_mols_json = get_multi_molecular_text_to_correct(img_path)\n", " new_mols = json.loads(new_mols_json)\n", " for m in new_mols:\n", " if m[\"smiles\"] not in {km[\"smiles\"] for km in known_molecules}:\n", " known_molecules.append(m)\n", "\n", "\n", " json_path = os.path.join(output_dir, f\"{filename}_reaction_data.json\")\n", " with open(json_path, \"w\", encoding=\"utf-8\") as f:\n", " json.dump(image_data, f, indent=2, ensure_ascii=False)\n", " print(f\"[✓] Reaction data saved to: {json_path}\")\n", "\n", " # Step 4: Call Azure GPT-4 for annotation\n", " client = AzureOpenAI(\n", " api_key=api_key,\n", " api_version=\"2024-06-01\",\n", " azure_endpoint=azure_endpoint\n", " )\n", "\n", " prompt = \"\"\"\n", "You are a text-mining assistant for chemistry papers. Your task is to find the most relevant 1–3 sentences in a research article that describe a given figure or scheme.\n", "\n", "You will be given:\n", "- A block of text extracted from the article (in Markdown format).\n", "- The extracted structured data from one image (including its title and list of molecules or reactions).\n", "\n", "Your task is:\n", "1. Match the image with sentences that are most relevant to it. Use clues like the figure/scheme/table number in the title, or molecule/reaction labels (e.g., 1a, 2b, 3).\n", "2. Extract up to 3 short sentences that best describe or mention the contents of the image.\n", "3. In these sentences, label any molecule or reaction identifiers (like “1a”, “2b”) with their role based on context: [reactant], [product], etc.\n", "4. Also label experimental conditions with their roles:\n", " - Percent values like “85%” as [yield]\n", " - Temperatures like “100 °C” as [temperature]\n", " - Time durations like “24 h”, “20 min” as [time]\n", "5. Do **not** label chemical position numbers (e.g., in \"3-trifluoromethyl\", \"1,2,4-triazole\").\n", "6. Do not repeat any labels. Only label each item once per sentence.\n", "\n", "Output format:\n", "{\n", " \"title\": \"