In [None]:
import sys
import torch
import json
from chemietoolkit import ChemIEToolkit
import cv2
from PIL import Image
import json
# Module-level ChemIEToolkit instance running on CPU; get_multi_molecular and
# get_multi_molecular_text_to_correct read this global by name.
model = ChemIEToolkit(device=torch.device('cpu')) 
from get_molecular_agent import process_reaction_image_with_multiple_products_and_text
from get_reaction_agent import get_reaction_withatoms
from get_reaction_agent import get_full_reaction


# Helper that takes a single image path and returns the molecule/coreference results as a JSON string
def get_multi_molecular(image_path: str) -> str:
    """Extract molecule/coreference data from an image and return it as JSON.

    The image is loaded as RGB and handed to
    ``model.extract_molecule_corefs_from_figures``. For every entry under
    ``"bboxes"`` in each result item, the bulky fields listed below are
    removed in place; all remaining fields (including ``"bbox"``) are kept.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        The trimmed results serialized with ``json.dumps`` (a ``str``, not a
        list). The same string is also printed to stdout.
    """
    # Close the file handle deterministically; convert() returns an
    # independent in-memory copy, so the image stays usable afterwards.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert('RGB')

    coref_results = model.extract_molecule_corefs_from_figures([image])

    dropped_keys = (
        "category", "molfile", "symbols", "atoms", "bonds",
        "category_id", "score", "corefs", "coords", "edges",
    )
    for item in coref_results:
        for bbox in item.get("bboxes", []):
            for key in dropped_keys:
                bbox.pop(key, None)  # tolerate keys that are already absent

    # Serialize once and reuse the string for both the log line and the return.
    serialized = json.dumps(coref_results)
    print(serialized)
    return serialized

def get_multi_molecular_text_to_correct(image_path: str) -> str:
    """Extract molecule/coreference data from an image, without bounding boxes.

    Works like ``get_multi_molecular`` but additionally strips the ``"bbox"``
    field from every entry under ``"bboxes"``, so only the fields not listed
    below survive in the output.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        The trimmed results serialized with ``json.dumps`` (a ``str``, not a
        list). The same string is also printed to stdout.
    """
    # Close the file handle deterministically; convert() returns an
    # independent in-memory copy, so the image stays usable afterwards.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert('RGB')

    coref_results = model.extract_molecule_corefs_from_figures([image])

    dropped_keys = (
        "category", "bbox", "molfile", "symbols", "atoms", "bonds",
        "category_id", "score", "corefs", "coords", "edges",
    )
    for item in coref_results:
        for bbox in item.get("bboxes", []):
            for key in dropped_keys:
                bbox.pop(key, None)  # tolerate keys that are already absent

    # Serialize once and reuse the string for both the log line and the return.
    serialized = json.dumps(coref_results)
    print(serialized)
    return serialized

def get_multi_molecular_text_to_correct_withatoms(image_path: str) -> str:
    """Extract molecule data via the molecular agent and return it as JSON.

    Delegates to ``process_reaction_image_with_multiple_products_and_text``
    and then removes the fields listed below from every entry under
    ``"bboxes"``. Unlike the sibling helpers, ``category``, ``bbox`` and
    ``symbols`` are retained.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        The trimmed results serialized with ``json.dumps`` (a ``str``, not a
        list). The same string is also printed to stdout.
    """
    # The agent receives the path itself, so the image is not opened here;
    # the previous eager PIL load produced a value that was never used.
    coref_results = process_reaction_image_with_multiple_products_and_text(image_path)

    dropped_keys = (
        "molfile", "atoms", "bonds", "category_id",
        "score", "corefs", "coords", "edges",
    )
    for item in coref_results:
        for bbox in item.get("bboxes", []):
            for key in dropped_keys:
                bbox.pop(key, None)  # tolerate keys that are already absent

    # Serialize once and reuse the string for both the log line and the return.
    serialized = json.dumps(coref_results)
    print(serialized)
    return serialized

#get_multi_molecular_text_to_correct('./acs.joc.2c00176 example 1.png')

import sys
#sys.path.append('./RxnScribe-main/')
import torch
from rxnscribe import RxnScribe
import json

# Checkpoint file for RxnScribe, given relative to the current working directory.
ckpt_path = "./pix2seq_reaction_full.ckpt"
# RxnScribe instance on CPU; in the visible code it is referenced only from
# commented-out calls (get_reaction now goes through get_reaction_withatoms).
model1 = RxnScribe(ckpt_path, device=torch.device('cpu'))
# CPU device handle; not referenced again in the visible code.
device = torch.device('cpu')

def get_reaction(image_path: str) -> dict:
    """Summarise the first reaction predicted for an image.

    Calls ``get_reaction_withatoms`` and keeps only the first prediction.
    For each of the sections ``reactants``, ``conditions`` and ``products``
    that is present, a list of trimmed entries is produced:

    * reactants / products: ``{"smiles": ..., "bbox": ...}`` (missing values
      default to ``""`` and ``[]``);
    * conditions: ``{"bbox": ...}`` plus ``"smiles"`` and/or ``"text"`` only
      when the source entry carries them.

    Args:
        image_path: Path of the reaction image.

    Returns:
        A dict keyed by section name. It is empty when no prediction is
        returned or none of the sections is present. The dict is also
        printed to stdout.
    """
    raw_prediction = get_reaction_withatoms(image_path)

    structured_output = {}
    # An empty prediction list used to raise IndexError on raw_prediction[0].
    first_reaction = raw_prediction[0] if raw_prediction else {}

    for section_key in ('reactants', 'conditions', 'products'):
        if section_key not in first_reaction:
            continue

        if section_key == 'conditions':
            entries = []
            for item in first_reaction[section_key]:
                condition_data = {"bbox": item.get("bbox", [])}
                # Copy the optional fields only when the entry provides them.
                for optional_key in ("smiles", "text"):
                    if optional_key in item:
                        condition_data[optional_key] = item[optional_key]
                entries.append(condition_data)
        else:
            entries = [
                {"smiles": item.get("smiles", ""), "bbox": item.get("bbox", [])}
                for item in first_reaction[section_key]
            ]
        structured_output[section_key] = entries

    print(f"structured_output:{structured_output}")

    return structured_output




import base64
import torch
import json
from PIL import Image
import numpy as np
from chemietoolkit import ChemIEToolkit, utils
from openai import AzureOpenAI

def process_reaction_image_with_multiple_products(image_path: str) -> dict:
    """Extract a reaction template and its per-label reactions from an image.

    Workflow, as implemented below:

    1. Send the prompt from ``./prompt.txt`` and the base64-encoded image to
       GPT-4o together with two tool definitions.
    2. Run every tool GPT asks for (always on ``image_path``; the arguments
       supplied by GPT are ignored) and feed the results back for a second
       completion, whose JSON content becomes ``gpt_output``.
    3. Obtain reaction and molecule data from ``get_reaction_withatoms`` and
       ``process_reaction_image_with_multiple_products_and_text``.
    4. Combine everything through ``utils.backout_without_coref``.

    Args:
        image_path: Path of the reaction image file.

    Returns:
        On success a dict with ``"reaction_template"`` (reactant and product
        SMILES of the first reaction), ``"reactions"`` (label -> reactants /
        products, sorted by label) and ``"Original molecular list"`` (the
        parsed GPT answer). If the back-out step fails, only
        ``"Original molecular list"`` is returned. The result is printed.

    Raises:
        ValueError: If GPT requests a tool that is not in the tool map.
    """
    # NOTE(review): the API key and endpoint are committed in source. Treat the
    # key as exposed; rotate it and load it from configuration instead.
    api_key = "b038da96509b4009be931e035435e022"
    azure_endpoint = "https://hkust.azure-api.net"

    model = ChemIEToolkit(device=torch.device('cpu'))
    client = AzureOpenAI(
        api_key=api_key,
        api_version='2024-06-01',
        azure_endpoint=azure_endpoint
    )

    # Encode the image as base64 so it can be embedded in a data URL.
    with open(image_path, "rb") as image_file:
        base64_image = base64.b64encode(image_file.read()).decode('utf-8')

    def _image_tool(name, description, path_description):
        """Build a function-tool schema taking a single ``image_path`` string."""
        return {
            'type': 'function',
            'function': {
                'name': name,
                'description': description,
                'parameters': {
                    'type': 'object',
                    'properties': {
                        'image_path': {
                            'type': 'string',
                            'description': path_description,
                        },
                    },
                    'required': ['image_path'],
                    'additionalProperties': False,
                },
            },
        }

    tools = [
        _image_tool(
            'get_multi_molecular_text_to_correct',
            'Extracts the SMILES string and text coref from molecular images.',
            'Path to the reaction image.',
        ),
        _image_tool(
            'get_reaction',
            'Get a list of reactions from a reaction image. A reaction contains data of the reactants, conditions, and products.',
            'The path to the reaction image.',
        ),
    ]

    with open('./prompt.txt', 'r') as prompt_file:
        prompt = prompt_file.read()

    # Built once and reused for both completions.
    messages = [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': prompt},
                {'type': 'image_url', 'image_url': {'url': f'data:image/png;base64,{base64_image}'}}
            ]
        }
    ]

    # First completion: GPT decides which tools to call.
    response = client.chat.completions.create(
        model='gpt-4o',
        temperature=0,
        response_format={'type': 'json_object'},
        messages=messages,
        tools=tools,
    )
    assistant_message = response.choices[0].message

    TOOL_MAP = {
        'get_multi_molecular_text_to_correct': get_multi_molecular_text_to_correct,
        'get_reaction': get_reaction
    }

    # tool_calls is None when GPT answers without requesting any tool.
    results = []
    for tool_call in assistant_message.tool_calls or []:
        tool_name = tool_call.function.name
        if tool_name not in TOOL_MAP:
            raise ValueError(f"Unknown tool called: {tool_name}")

        # Tools always run on the caller's image_path, not on GPT's arguments.
        tool_result = TOOL_MAP[tool_name](image_path)

        results.append({
            'role': 'tool',
            'content': json.dumps({
                'image_path': image_path,
                f'{tool_name}': tool_result,
            }),
            'tool_call_id': tool_call.id,
        })

    # Second completion: GPT sees the tool outputs and produces the final JSON.
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[*messages, assistant_message, *results],
        response_format={'type': 'json_object'},
        temperature=0
    )

    gpt_output = json.loads(response.choices[0].message.content)
    print(f"gptout:{gpt_output}")

    reaction_prediction = get_reaction_withatoms(image_path)[0]

    # NOTE(review): zip() pairs the i-th reactant, condition and product into
    # separate one-element "reactions" and truncates to the shortest list;
    # confirm this is the shape utils.backout_without_coref expects.
    reactions = [
        {"reactants": [reactant], "conditions": [condition], "products": [product]}
        for reactant, condition, product in zip(
            reaction_prediction.get('reactants', []),
            reaction_prediction.get('conditions', []),
            reaction_prediction.get('products', []),
        )
    ]
    reaction_results = [{"reactions": reactions}]
    coref_results = process_reaction_image_with_multiple_products_and_text(image_path)

    def extract_smiles_details(smiles_data, raw_details):
        """Map each SMILES in ``smiles_data`` to the details of a matching bbox."""
        smiles_details = {}
        for smiles in smiles_data:
            for detail in raw_details:
                for bbox in detail.get('bboxes', []):
                    if bbox.get('smiles') == smiles:
                        smiles_details[smiles] = {
                            'category': bbox.get('category'),
                            'bbox': bbox.get('bbox'),
                            'category_id': bbox.get('category_id'),
                            'score': bbox.get('score'),
                            'molfile': bbox.get('molfile'),
                            'atoms': bbox.get('atoms'),
                            'bonds': bbox.get('bonds')
                        }
                        # Only leaves the bbox loop; a match in a later
                        # detail entry still overwrites this one.
                        break
        return smiles_details

    # Iterating gpt_output (a dict) yields its keys, which are looked up as SMILES.
    smiles_details = extract_smiles_details(gpt_output, coref_results)

    # The template comes from the first reaction only; an image without any
    # detected reaction yields empty lists instead of an IndexError.
    template_reaction = reactions[0] if reactions else {}
    reactants_array = [
        molecule['smiles']
        for molecule in template_reaction.get('reactants', [])
        if 'smiles' in molecule
    ]
    products = [
        molecule['smiles']
        for molecule in template_reaction.get('products', [])
        if 'smiles' in molecule
    ]

    try:
        backed_out = utils.backout_without_coref(reaction_results, coref_results, gpt_output, smiles_details, model.molscribe)
        backed_out.sort(key=lambda entry: entry[2])
        extracted_rxns = {
            label: {'reactants': rxn_reactants, 'products': rxn_products}
            for rxn_reactants, rxn_products, label in backed_out
        }
        final_data = {
            "reaction_template": {
                "reactants": reactants_array,
                "products": products
            },
            # Present the reactions ordered by label.
            "reactions": {label: extracted_rxns[label] for label in sorted(extracted_rxns)},
            'Original molecular list': gpt_output,
        }
    except Exception as exc:
        # Best effort: fall back to the molecule list alone, but say why
        # (a bare except would also have swallowed KeyboardInterrupt).
        print(f"[!] Reaction back-out failed, returning molecule list only: {exc}")
        final_data = {'Original molecular list': gpt_output}

    print(final_data)
    return final_data
 





In [None]:
# # image_path = './example/Replace/99.jpg'
# # result = process_reaction_image(image_path)
# # print(json.dumps(result, indent=4))
# image_path = './example/example1/replace/Nesting/283.jpg'
# image = Image.open(image_path).convert('RGB')
# image_np = np.array(image)

# # input1 = get_multi_molecular_text_to_correct_withatoms('./example/example1/replace/Nesting/283.jpg')
# # input2 = get_reaction('./example/example1/replace/Nesting/283.jpg')
# # print(input1)
# # print(input2)
# #reaction_results = model.extract_reactions_from_figures([image_np])
# coorf = model.extract_molecule_corefs_from_figures([image_np])
# print(coorf)


In [None]:
import base64
import torch
import json
from PIL import Image
import numpy as np
from openai import AzureOpenAI

def process_reaction_image_final(image_path: str) -> dict:
    """Let GPT-4o choose extraction tools for an image and return its JSON answer.

    The prompt from ``./prompt_final.txt`` and the base64-encoded image are
    sent to GPT-4o with five tool definitions. Every tool GPT requests is run
    on ``image_path`` (the arguments supplied by GPT are ignored), the results
    are appended to the conversation, and a second completion produces the
    final JSON object.

    Args:
        image_path: Path of the image file.

    Returns:
        The parsed JSON content of the second completion. It is also printed.

    Raises:
        ValueError: If GPT requests a tool that is not in the tool map.
    """
    # NOTE(review): the API key and endpoint are committed in source. Treat the
    # key as exposed; rotate it and load it from configuration instead.
    api_key = "b038da96509b4009be931e035435e022"
    azure_endpoint = "https://hkust.azure-api.net"

    # No ChemIEToolkit is created here: this function never used the instance
    # it previously built on every call.
    client = AzureOpenAI(
        api_key=api_key,
        api_version='2024-06-01',
        azure_endpoint=azure_endpoint
    )

    # Encode the image as base64 so it can be embedded in a data URL.
    with open(image_path, "rb") as image_file:
        base64_image = base64.b64encode(image_file.read()).decode('utf-8')

    def _image_tool(name, description, path_description='The path to the reaction image.'):
        """Build a function-tool schema taking a single ``image_path`` string."""
        return {
            'type': 'function',
            'function': {
                'name': name,
                'description': description,
                'parameters': {
                    'type': 'object',
                    'properties': {
                        'image_path': {
                            'type': 'string',
                            'description': path_description,
                        },
                    },
                    'required': ['image_path'],
                    'additionalProperties': False,
                },
            },
        }

    # Descriptions are sent to GPT verbatim and are therefore left untouched.
    tools = [
        _image_tool(
            'get_multi_molecular_text_to_correct',
            'Extracts the SMILES string and text coref from molecular sub-images from a reaction image and ready for further process.',
            'Path to the reaction image.',
        ),
        _image_tool(
            'get_reaction',
            'Get a list of reactions from a reaction image. A reaction contains data of the reactants, conditions, and products.',
        ),
        _image_tool(
            'process_reaction_image_with_multiple_products',
            'process the reaction image that contains a multiple products table. Get a list of reactions from the reaction image, Inculding the reaction template and detailed reaction with detailed R-group information.',
        ),
        _image_tool(
            'get_full_reaction',
            'Get a list of reactions from a reaction image without any tables. A reaction contains data of the reactants, conditions, and products.',
        ),
        _image_tool(
            'get_multi_molecular',
            'Extracts the SMILES string and text coref from a molecular image without any reactions',
        ),
    ]

    with open('./prompt_final.txt', 'r') as prompt_file:
        prompt = prompt_file.read()

    # Built once and reused for both completions.
    messages = [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': prompt},
                {'type': 'image_url', 'image_url': {'url': f'data:image/png;base64,{base64_image}'}}
            ]
        }
    ]

    # First completion: GPT decides which tools to call.
    response = client.chat.completions.create(
        model='gpt-4o',
        temperature=0,
        response_format={'type': 'json_object'},
        messages=messages,
        tools=tools,
    )
    assistant_message = response.choices[0].message

    TOOL_MAP = {
        'get_multi_molecular_text_to_correct': get_multi_molecular_text_to_correct,
        'get_reaction': get_reaction,
        'process_reaction_image_with_multiple_products': process_reaction_image_with_multiple_products,
        'get_full_reaction': get_full_reaction,
        'get_multi_molecular': get_multi_molecular,
    }

    # tool_calls is None when GPT answers without requesting any tool.
    results = []
    for tool_call in assistant_message.tool_calls or []:
        tool_name = tool_call.function.name
        if tool_name not in TOOL_MAP:
            raise ValueError(f"Unknown tool called: {tool_name}")

        # Tools always run on the caller's image_path, not on GPT's arguments.
        tool_result = TOOL_MAP[tool_name](image_path)

        results.append({
            'role': 'tool',
            'content': json.dumps({
                'image_path': image_path,
                f'{tool_name}': tool_result,
            }),
            'tool_call_id': tool_call.id,
        })

    # Second completion: GPT sees the tool outputs and produces the final JSON.
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[*messages, assistant_message, *results],
        response_format={'type': 'json_object'},
        temperature=0
    )

    gpt_output = json.loads(response.choices[0].message.content)
    print(gpt_output)
    return gpt_output





In [None]:
# Example run: process one sample image and pretty-print the returned JSON.
# `image_path` and `result` are notebook globals and are reassigned by later cells.
image_path = './data/bowen-4/2.png'
result = process_reaction_image_final(image_path)
print(json.dumps(result, indent=4))

In [None]:
# def get_reaction(image_path: str) -> list:
#     '''Returns a list of reactions extracted from the image.'''
#     image_file = image_path
#     return json.dumps(model1.predict_image_file(image_file, molscribe=True, ocr=True))

# reaction_output = get_reaction('./pdf/2/2_image_3_1.png')
# print(reaction_output)

In [None]:
import os
import fitz  # PyMuPDF
from core import run_visualheist
import base64
from openai import AzureOpenAI

def full_pdf_extraction_pipeline_with_history(pdf_path: str,
                                  output_dir: str,
                                  api_key: str,
                                  azure_endpoint: str,
                                  model: str = "gpt-4o",
                                  model_size: str = "large") -> list:
    """
    Full pipeline: from PDF to GPT-annotated related text.

    Steps:
      1. Dump the text of every PDF page into ``<output_dir>/<name>.md``.
      2. Extract figures into ``output_dir`` with ``run_visualheist``.
      3. Run ``process_reaction_image_final`` on every ``.png`` in
         ``output_dir`` and save the results to
         ``<name>_reaction_data.json``. When an image fails, molecules are
         collected through ``get_multi_molecular_text_to_correct`` instead.
      4. For every successfully parsed image, ask GPT for the article
         sentences that describe it and save the answers to
         ``<name>_annotated_related_text.json``.

    Args:
        pdf_path (str): Path to input PDF file.
        output_dir (str): Directory to save results (created if missing).
        api_key (str): Azure OpenAI API key.
        azure_endpoint (str): Azure OpenAI endpoint.
        model (str): GPT model name (default "gpt-4o").
        model_size (str): VisualHeist model size ("base", "large", etc).

    Returns:
        List of GPT-generated annotated related-text JSON objects, one per
        successfully parsed image.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Step 1: Extract Markdown text. The context manager closes the document,
    # which was previously left open.
    with fitz.open(pdf_path) as doc:
        md_text = "".join(
            f"\n\n## = Page {i} =\n\n" + page.get_text()
            for i, page in enumerate(doc, start=1)
        )
    filename = os.path.splitext(os.path.basename(pdf_path))[0]
    md_path = os.path.join(output_dir, f"{filename}.md")
    with open(md_path, "w", encoding="utf-8") as f:
        f.write(md_text.strip())
    print(f"[✓] Markdown saved to: {md_path}")

    # Step 2: Extract figures using VisualHeist
    run_visualheist(pdf_dir=pdf_path, model_size=model_size, image_dir=output_dir)
    print(f"[✓] Figures extracted to: {output_dir}")

    # Step 3: Parse figures to JSON
    image_data = []
    # NOTE(review): known_molecules is collected but not consumed or returned
    # anywhere in this function yet.
    known_molecules = []
    known_smiles = set()  # O(1) duplicate check instead of rebuilding a set per item

    for fname in sorted(os.listdir(output_dir)):
        if not fname.endswith(".png"):
            continue
        img_path = os.path.join(output_dir, fname)
        try:
            result = process_reaction_image_final(img_path)
            result["image_name"] = fname
            image_data.append(result)
        except Exception as e:
            print(f"[!] Failed on {fname}: {e}")
            # The fallback is best effort too: one bad image must not abort
            # the whole pipeline from inside the exception handler.
            try:
                new_mols = json.loads(get_multi_molecular_text_to_correct(img_path))
            except Exception as fallback_error:
                print(f"[!] Fallback molecule extraction failed on {fname}: {fallback_error}")
                continue
            for entry in new_mols:
                if not isinstance(entry, dict):
                    continue
                # The coref helper nests molecules under "bboxes"; a flat
                # entry carrying "smiles" itself is accepted as well.
                for mol in entry.get("bboxes", [entry]):
                    smiles = mol.get("smiles") if isinstance(mol, dict) else None
                    if smiles is not None and smiles not in known_smiles:
                        known_smiles.add(smiles)
                        known_molecules.append(mol)

    json_path = os.path.join(output_dir, f"{filename}_reaction_data.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(image_data, f, indent=2, ensure_ascii=False)
    print(f"[✓] Reaction data saved to: {json_path}")

    # Step 4: Call Azure GPT-4 for annotation
    client = AzureOpenAI(
        api_key=api_key,
        api_version="2024-06-01",
        azure_endpoint=azure_endpoint
    )

    # JSON mode requires the prompt to mention JSON explicitly, hence the
    # wording of the "Output format" line.
    prompt = """
You are a text-mining assistant for chemistry papers. Your task is to find the most relevant 1–3 sentences in a research article that describe a given figure or scheme.

You will be given:
- A block of text extracted from the article (in Markdown format).
- The extracted structured data from one image (including its title and list of molecules or reactions).

Your task is:
1. Match the image with sentences that are most relevant to it. Use clues like the figure/scheme/table number in the title, or molecule/reaction labels (e.g., 1a, 2b, 3).
2. Extract up to 3 short sentences that best describe or mention the contents of the image.
3. In these sentences, label any molecule or reaction identifiers (like “1a”, “2b”) with their role based on context: [reactant], [product], etc.
4. Also label experimental conditions with their roles:
   - Percent values like “85%” as [yield]
   - Temperatures like “100 °C” as [temperature]
   - Time durations like “24 h”, “20 min” as [time]
5. Do **not** label chemical position numbers (e.g., in "3-trifluoromethyl", "1,2,4-triazole").
6. Do not repeat any labels. Only label each item once per sentence.

Output format (respond with a single JSON object):
{
  "title": "<title from image>",
  "related-text": [
    "Sentence with roles like 1a[reactant], 2c[product], 100[temperature] °C.",
    ...
  ]
}
"""

    annotated_results = []
    for item in image_data:
        img_path = os.path.join(output_dir, item["image_name"])
        with open(img_path, "rb") as f:
            base64_image = base64.b64encode(f.read()).decode("utf-8")

        combined_input = f"""
## Image Structured Data:
{json.dumps(item, indent=2)}

## Article Text:
{md_text}
"""

        response = client.chat.completions.create(
            model=model,
            temperature=0,
            # The API expects an object here; the bare string "json" is rejected.
            response_format={"type": "json_object"},
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt + "\n\n" + combined_input},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ]
        )
        annotated_results.append(json.loads(response.choices[0].message.content))

    # Save the annotations next to the other outputs
    with open(os.path.join(output_dir, f"{filename}_annotated_related_text.json"), "w", encoding="utf-8") as f:
        json.dump(annotated_results, f, indent=2, ensure_ascii=False)
    print("[✓] Annotated related-text saved.")

    return annotated_results

In [None]:
# Example run on a sample image; the commented-out path is an alternative input.
# `image_path` and `result` are notebook globals shared with other cells.
image_path = './data/example/example1/replace/Nesting/283.jpg'
#image_path = './pdf/2/2_image_1_1.png'
result = process_reaction_image_final(image_path)
print(json.dumps(result, indent=4))

In [None]:
# import os

# image_folder = './example/example1/replace/regular/'  # 图片文件夹路径
# output_folder = './batches_final_repalce_regular/'  # 保存每批结果的文件夹路径
# batch_size = 3  # 每批处理文件数量

# # 创建保存批次结果的文件夹（如果不存在）
# os.makedirs(output_folder, exist_ok=True)

# # 获取所有图片文件并按字母顺序排序
# all_files = sorted([f for f in os.listdir(image_folder) if f.endswith('.jpg')])

# # 获取已完成的批次
# completed_batches = [
#     int(f.split('_')[1].split('.')[0]) for f in os.listdir(output_folder) if f.startswith('batch_') and f.endswith('.json')
# ]
# completed_batches = sorted(completed_batches)  # 确保按顺序排序

# # 从指定批次开始（如果有未完成批次）
# start_batch = (completed_batches[-1] + 1) if completed_batches else 1

# # 将文件分批并从指定批次开始
# for batch_index in range((start_batch - 1) * batch_size, len(all_files), batch_size):
#     batch_files = all_files[batch_index:batch_index + batch_size]
#     results = []

#     batch_number = batch_index // batch_size + 1
#     print(f"正在按字母顺序处理第 {batch_number} 批，共 {len(batch_files)} 张图片...")
    
#     for file_name in batch_files:
#         image_path = os.path.join(image_folder, file_name)
#         print(f"处理文件 {file_name}...")
        
#         try:
#             # 处理单个图片
#             result = process_reaction_image_final(image_path)
            
#             # 确保结果是字典
#             if isinstance(result, dict):
#                 # 添加文件名信息
#                 result_with_filename = {
#                     "file_name": file_name,
#                     **result
#                 }
#                 results.append(result_with_filename)
#                 print(result_with_filename)
#             else:
#                 print(f"文件 {file_name} 的处理结果不是字典，跳过。")
        
#         except Exception as e:
#             print(f"处理文件 {file_name} 时出错: {e}")

#     # 保存当前批次结果
#     batch_output_path = os.path.join(output_folder, f'batch_{batch_number}.json')
#     with open(batch_output_path, 'w', encoding='utf-8') as json_file:
#         json.dump(results, json_file, ensure_ascii=False, indent=4)

#     print(f"第 {batch_number} 批处理完成，结果保存到 {batch_output_path}")

# print("所有批次处理完成！")




In [None]:
import rdkit
from rdkit import Chem
from rdkit.Chem import Draw

# Parse a SMILES string and render it; as the last expression of a notebook
# cell the resulting image is displayed.
# NOTE(review): Chem.MolFromSmiles returns None when it cannot parse or sanitize
# the input, and MolToImage would then fail. Confirm this SMILES is valid (the
# neutral N bearing both a benzyl group and a double bond looks over-valent).
Draw.MolToImage(Chem.MolFromSmiles('[Si](C)(C)OC(c1ccccc1)(c1ccccc1)C1CCC2=NN(Cc3ccccc3)=CN21'))