Spaces:

CYF200127
/

ChemEagle_API

Sleeping

App Files Files Community

CYF200127 committed on May 17

Commit

1f516b6

verified ·

1 Parent(s): 768c438

Upload 162 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +5 -0
__init__.py +3 -0
__pycache__/get_molecular_agent.cpython-310.pyc +0 -0
__pycache__/get_reaction_agent.cpython-310.pyc +0 -0
__pycache__/main.cpython-310.pyc +0 -0
app.ipynb +295 -0
app.py +239 -0
chemiener/__init__.py +1 -0
chemiener/__pycache__/__init__.cpython-310.pyc +0 -0
chemiener/__pycache__/__init__.cpython-38.pyc +0 -0
chemiener/__pycache__/dataset.cpython-310.pyc +0 -0
chemiener/__pycache__/dataset.cpython-38.pyc +0 -0
chemiener/__pycache__/interface.cpython-310.pyc +0 -0
chemiener/__pycache__/interface.cpython-38.pyc +0 -0
chemiener/__pycache__/model.cpython-310.pyc +0 -0
chemiener/__pycache__/model.cpython-38.pyc +0 -0
chemiener/__pycache__/utils.cpython-310.pyc +0 -0
chemiener/__pycache__/utils.cpython-38.pyc +0 -0
chemiener/dataset.py +172 -0
chemiener/interface.py +124 -0
chemiener/main.py +345 -0
chemiener/model.py +14 -0
chemiener/utils.py +23 -0
chemietoolkit/__init__.py +1 -0
chemietoolkit/__pycache__/__init__.cpython-310.pyc +0 -0
chemietoolkit/__pycache__/__init__.cpython-38.pyc +0 -0
chemietoolkit/__pycache__/chemrxnextractor.cpython-310.pyc +0 -0
chemietoolkit/__pycache__/chemrxnextractor.cpython-38.pyc +0 -0
chemietoolkit/__pycache__/interface.cpython-310.pyc +0 -0
chemietoolkit/__pycache__/interface.cpython-38.pyc +0 -0
chemietoolkit/__pycache__/tableextractor.cpython-310.pyc +0 -0
chemietoolkit/__pycache__/utils.cpython-310.pyc +0 -0
chemietoolkit/chemrxnextractor.py +107 -0
chemietoolkit/interface.py +749 -0
chemietoolkit/tableextractor.py +340 -0
chemietoolkit/utils.py +1018 -0
examples/exp.png +3 -0
examples/image.webp +0 -0
examples/rdkit.png +0 -0
examples/reaction1.jpg +0 -0
examples/reaction2.png +0 -0
examples/reaction3.png +0 -0
examples/reaction4.png +3 -0
get_molecular_agent.py +599 -0
get_reaction_agent.py +507 -0
main.py +546 -0
main_Rgroup_debug.ipynb +993 -0
molscribe/__init__.py +1 -0
molscribe/__pycache__/__init__.cpython-310.pyc +0 -0
molscribe/__pycache__/augment.cpython-310.pyc +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/exp.png filter=lfs diff=lfs merge=lfs -text
+examples/reaction4.png filter=lfs diff=lfs merge=lfs -text
+molscribe/indigo/lib/Linux/x64/libbingo.so filter=lfs diff=lfs merge=lfs -text
+molscribe/indigo/lib/Linux/x64/libindigo-renderer.so filter=lfs diff=lfs merge=lfs -text
+molscribe/indigo/lib/Linux/x64/libindigo.so filter=lfs diff=lfs merge=lfs -text

__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+# Package metadata: release version, author and credits.
+__version__ = "0.1.0"
+__author__ = 'Alex Wang'
+__credits__ = 'CSAIL'

__pycache__/get_molecular_agent.cpython-310.pyc ADDED Viewed

Binary file (9.08 kB). View file

__pycache__/get_reaction_agent.cpython-310.pyc ADDED Viewed

Binary file (6.94 kB). View file

__pycache__/main.cpython-310.pyc ADDED Viewed

Binary file (8.97 kB). View file

app.ipynb ADDED Viewed

	@@ -0,0 +1,295 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d13d3631",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "* Running on local URL:  http://127.0.0.1:7866\n",
+      "\n",
+      "To create a public link, set `share=True` in `launch()`.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div><iframe src=\"http://127.0.0.1:7866/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import gradio as gr\n",
+    "import json\n",
+    "from main import ChemEagle  # 支持 API key 通过环境变量\n",
+    "from rdkit import Chem\n",
+    "from rdkit.Chem import rdChemReactions\n",
+    "from rdkit.Chem import Draw\n",
+    "from rdkit.Chem import AllChem\n",
+    "from rdkit.Chem.Draw import rdMolDraw2D\n",
+    "import cairosvg\n",
+    "import re\n",
+    "import torch\n",
+    "\n",
+    "example_diagram = \"examples/exp.png\"\n",
+    "rdkit_image = \"examples/rdkit.png\"\n",
+    "# 解析 ChemEagle 返回的结构化数据\n",
+    "def parse_reactions(output_json):\n",
+    "    \"\"\"\n",
+    "    解析 JSON 格式的反应数据并格式化输出，包含颜色定制。\n",
+    "    \"\"\"\n",
+    "    if isinstance(output_json, str):\n",
+    "        reactions_data = json.loads(output_json)\n",
+    "    elif isinstance(output_json, dict):\n",
+    "        reactions_data = output_json  # 转换 JSON 字符串为字典\n",
+    "    reactions_list = reactions_data.get(\"reactions\", [])\n",
+    "    detailed_output = []\n",
+    "    smiles_output = [] \n",
+    "\n",
+    "    for reaction in reactions_list:\n",
+    "        reaction_id = reaction.get(\"reaction_id\", \"Unknown ID\")\n",
+    "        reactants = [r.get(\"smiles\", \"Unknown\") for r in reaction.get(\"reactants\", [])]\n",
+    "        conditions = [\n",
+    "            f\"<span style='color:red'>{c.get('smiles', c.get('text', 'Unknown'))}[{c.get('role', 'Unknown')}]</span>\"\n",
+    "            for c in reaction.get(\"condition\", [])\n",
+    "        ]\n",
+    "        conditions_1 = [\n",
+    "            f\"<span style='color:black'>{c.get('smiles', c.get('text', 'Unknown'))}[{c.get('role', 'Unknown')}]</span>\"\n",
+    "            for c in reaction.get(\"condition\", [])\n",
+    "        ]\n",
+    "        products = [f\"<span style='color:orange'>{p.get('smiles', 'Unknown')}</span>\" for p in reaction.get(\"products\", [])]\n",
+    "        products_1 = [f\"<span style='color:black'>{p.get('smiles', 'Unknown')}</span>\" for p in reaction.get(\"products\", [])]\n",
+    "        products_2 = [r.get(\"smiles\", \"Unknown\") for r in reaction.get(\"products\", [])]\n",
+    "        \n",
+    "        additional = reaction.get(\"additional_info\", [])\n",
+    "        additional_str = [str(x) for x in additional if x is not None]\n",
+    "\n",
+    "        tail = conditions_1 + additional_str\n",
+    "        tail_str = \", \".join(tail)\n",
+    "\n",
+    "        # 构造反应的完整字符串，定制字体颜色\n",
+    "        full_reaction = f\"{'.'.join(reactants)}>>{'.'.join(products_1)} | {tail_str}\"\n",
+    "        full_reaction = f\"<span style='color:black'>{full_reaction}</span>\"\n",
+    "        \n",
+    "        # 详细反应格式化输出\n",
+    "        reaction_output = f\"<b>Reaction: </b> {reaction_id}<br>\"\n",
+    "        reaction_output += f\"  Reactants: <span style='color:blue'>{', '.join(reactants)}</span><br>\"\n",
+    "        reaction_output += f\"  Conditions: {', '.join(conditions)}<br>\"\n",
+    "        reaction_output += f\"  Products: {', '.join(products)}<br>\"\n",
+    "        reaction_output += f\"  additional_info: {', '.join(additional_str)}<br>\"\n",
+    "        reaction_output += f\"  <b>Full Reaction:</b> {full_reaction}<br>\"\n",
+    "        reaction_output += \"<br>\"\n",
+    "        detailed_output.append(reaction_output)\n",
+    "\n",
+    "        reaction_smiles = f\"{'.'.join(reactants)}>>{'.'.join(products_2)}\"\n",
+    "        smiles_output.append(reaction_smiles)\n",
+    "    return detailed_output, smiles_output\n",
+    "\n",
+    "\n",
+    "# 核心处理函数，仅使用 API Key 和图像\n",
+    "def process_chem_image(api_key, image):\n",
+    "    # 设置 API Key 环境变量，供 ChemEagle 使用\n",
+    "    os.environ[\"CHEMEAGLE_API_KEY\"] = api_key\n",
+    "\n",
+    "    # 保存上传图片\n",
+    "    image_path = \"temp_image.png\"\n",
+    "    image.save(image_path)\n",
+    "\n",
+    "    # 调用 ChemEagle（实现内部读取 os.getenv）\n",
+    "    chemeagle_result = ChemEagle(image_path)\n",
+    "\n",
+    "    # 解析输出\n",
+    "    detailed, smiles = parse_reactions(chemeagle_result)\n",
+    "\n",
+    "    # 写出 JSON\n",
+    "    json_path = \"output.json\"\n",
+    "    with open(json_path, 'w') as jf:\n",
+    "        json.dump(chemeagle_result, jf, indent=2)\n",
+    "\n",
+    "    # 返回 HTML、SMILES 合并文本、示意图、JSON 下载\n",
+    "    return \"\\n\\n\".join(detailed), smiles, example_diagram, json_path\n",
+    "\n",
+    "# 构建 Gradio 界面\n",
+    "with gr.Blocks() as demo:\n",
+    "    gr.Markdown(\n",
+    "        \"\"\"\n",
+    "        <center><h1>ChemEagle: A Multi-Agent System for Multimodal Chemical Information Extraction</h1></center>\n",
+    "        Upload a multimodal reaction image and type your OpenAI API key to extract multimodal chemical information.\n",
+    "        \"\"\"\n",
+    "    )\n",
+    "\n",
+    "    with gr.Row():\n",
+    "        # ———— 左侧：上传 + API Key + 按钮 ————\n",
+    "        with gr.Column(scale=1):\n",
+    "            image_input   = gr.Image(type=\"pil\", label=\"Upload a multimodal reaction image\")\n",
+    "            api_key_input = gr.Textbox(\n",
+    "                label=\"Your API-Key\",\n",
+    "                placeholder=\"Type your OpenAI_API_KEY\",\n",
+    "                type=\"password\"\n",
+    "            )\n",
+    "            with gr.Row():\n",
+    "                clear_btn = gr.Button(\"Clear\")\n",
+    "                run_btn   = gr.Button(\"Run\", elem_id=\"submit-btn\")\n",
+    "\n",
+    "        # ———— 中间：解析结果 + 示意图 ————\n",
+    "        with gr.Column(scale=1):\n",
+    "            gr.Markdown(\"### Parsed Reactions\")\n",
+    "            reaction_output   = gr.HTML(label=\"Detailed Reaction Output\")\n",
+    "            gr.Markdown(\"### Schematic Diagram\")\n",
+    "            schematic_diagram = gr.Image(value=example_diagram, label=\"示意图\")\n",
+    "\n",
+    "        # ———— 右侧：SMILES 拆分 & RDKit 渲染 + JSON 下载 ————\n",
+    "        with gr.Column(scale=1):\n",
+    "            gr.Markdown(\"### Machine-readable Output\")\n",
+    "            smiles_output = gr.Textbox(\n",
+    "                label=\"Reaction SMILES\",\n",
+    "                show_copy_button=True,\n",
+    "                interactive=False,\n",
+    "                visible=False\n",
+    "            )\n",
+    "\n",
+    "            @gr.render(inputs = smiles_output)  # 使用gr.render修饰器绑定输入和渲染逻辑\n",
+    "            def show_split(inputs):  # 定义处理和展示分割文本的函数\n",
+    "                if not inputs or isinstance(inputs, str) and inputs.strip() == \"\":  # 检查输入文本是否为空\n",
+    "                    return gr.Textbox(label= \"SMILES of Reaction i\"), gr.Image(value=rdkit_image, label= \"RDKit Image of Reaction i\",height=100)\n",
+    "                else:\n",
+    "                    # 假设输入是逗号分隔的 SMILES 字符串\n",
+    "                    smiles_list = inputs.split(\",\")\n",
+    "                    smiles_list = [re.sub(r\"^\\s*\\[?'?|'\\]?\\s*$\", \"\", item) for item in smiles_list]\n",
+    "                    components = []  # 初始化一个组件列表，用于存放每个 SMILES 对应的 Textbox 组件\n",
+    "                    for i, smiles in enumerate(smiles_list): \n",
+    "                        smiles.replace('\"', '').replace(\"'\", \"\").replace(\"[\", \"\").replace(\"]\", \"\")\n",
+    "                        rxn = rdChemReactions.ReactionFromSmarts(smiles, useSmiles=True)\n",
+    "                        \n",
+    "                        if rxn:\n",
+    "\n",
+    "                            new_rxn = AllChem.ChemicalReaction()\t\n",
+    "                            for mol in rxn.GetReactants():\n",
+    "                                mol = Chem.MolFromMolBlock(Chem.MolToMolBlock(mol))\n",
+    "                                new_rxn.AddReactantTemplate(mol)\n",
+    "                            for mol in rxn.GetProducts():\n",
+    "                                mol = Chem.MolFromMolBlock(Chem.MolToMolBlock(mol))\n",
+    "                                new_rxn.AddProductTemplate(mol)\n",
+    "\n",
+    "                            rxn = new_rxn\n",
+    "\n",
+    "                            def atom_mapping_remover(rxn):\n",
+    "                                for reactant in rxn.GetReactants():\n",
+    "                                    for atom in reactant.GetAtoms():\n",
+    "                                        atom.SetAtomMapNum(0)\n",
+    "                                for product in rxn.GetProducts():\n",
+    "                                    for atom in product.GetAtoms():\n",
+    "                                        atom.SetAtomMapNum(0)\n",
+    "                                return rxn\n",
+    "                            \n",
+    "                            atom_mapping_remover(rxn)\n",
+    "\n",
+    "                            reactant1 = rxn.GetReactantTemplate(0)\n",
+    "                            print(reactant1.GetNumBonds)\n",
+    "                            reactant2 = rxn.GetReactantTemplate(1) if rxn.GetNumReactantTemplates() > 1 else None\n",
+    "\n",
+    "                            if reactant1.GetNumBonds() > 0:\n",
+    "                                bond_length_reference = Draw.MeanBondLength(reactant1)\n",
+    "                            elif reactant2 and reactant2.GetNumBonds() > 0:\n",
+    "                                bond_length_reference = Draw.MeanBondLength(reactant2)\n",
+    "                            else:\n",
+    "                                bond_length_reference = 1.0 \n",
+    "\n",
+    "\n",
+    "                            drawer = rdMolDraw2D.MolDraw2DSVG(-1, -1)\n",
+    "                            dopts = drawer.drawOptions()\n",
+    "                            dopts.padding = 0.1 \n",
+    "                            dopts.includeRadicals = True\n",
+    "                            Draw.SetACS1996Mode(dopts, bond_length_reference*0.55)\n",
+    "                            dopts.bondLineWidth = 1.5\n",
+    "                            drawer.DrawReaction(rxn)\n",
+    "                            drawer.FinishDrawing()\n",
+    "                            svg_content = drawer.GetDrawingText()\n",
+    "                            svg_file = f\"reaction{i+1}.svg\"\n",
+    "                            with open(svg_file, \"w\") as f:\n",
+    "                                f.write(svg_content)\n",
+    "                            png_file = f\"reaction_{i+1}.png\"\n",
+    "                            cairosvg.svg2png(url=svg_file, write_to=png_file)\n",
+    "\n",
+    "\n",
+    "                        \n",
+    "                        components.append(gr.Textbox(value=smiles,label= f\"SMILES of Reaction {i}\", show_copy_button=True, interactive=False))\n",
+    "                        components.append(gr.Image(value=png_file,label= f\"RDKit Image of Reaction {i}\")) \n",
+    "                    return components  # 返回包含所有 SMILES Textbox 组件的列表\n",
+    "\n",
+    "            download_json = gr.File(label=\"Download JSON File\")\n",
+    "\n",
+    "\n",
+    "    gr.Examples(\n",
+    "        examples=[\n",
+    "            [\"examples/reaction1.jpg\", \"\"],\n",
+    "            [\"examples/reaction2.png\", \"\"],\n",
+    "            [\"examples/reaction3.png\", \"\"],\n",
+    "            [\"examples/reaction4.png\", \"\"],\n",
+    "            \n",
+    "            \n",
+    "        ],\n",
+    "        inputs=[image_input, api_key_input],\n",
+    "        outputs=[reaction_output, smiles_output, schematic_diagram, download_json],\n",
+    "        cache_examples=False,\n",
+    "        examples_per_page=4,\n",
+    "    )\n",
+    "\n",
+    "    # ———— 清空与运行 绑定 ————\n",
+    "    clear_btn.click(\n",
+    "        lambda: (None, None, None, None, None),\n",
+    "        inputs=[],\n",
+    "        outputs=[image_input, api_key_input, reaction_output, smiles_output, download_json]\n",
+    "    )\n",
+    "    run_btn.click(\n",
+    "        process_chem_image,\n",
+    "        inputs=[api_key_input, image_input],\n",
+    "        outputs=[reaction_output, smiles_output, schematic_diagram, download_json]\n",
+    "    )\n",
+    "\n",
+    "    # 自定义按钮样式\n",
+    "    demo.css = \"\"\"\n",
+    "    #submit-btn {\n",
+    "        background-color: #FF914D;\n",
+    "        color: white;\n",
+    "        font-weight: bold;\n",
+    "    }\n",
+    "    \"\"\"\n",
+    "\n",
+    "    demo.launch()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "openchemie",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

app.py ADDED Viewed

	@@ -0,0 +1,239 @@

+import os
+import gradio as gr
+import json
+from main import ChemEagle  # 支持 API key 通过环境变量
+from rdkit import Chem
+from rdkit.Chem import rdChemReactions
+from rdkit.Chem import Draw
+from rdkit.Chem import AllChem
+from rdkit.Chem.Draw import rdMolDraw2D
+import cairosvg
+import re
+import torch
+# Image shown in the "Schematic Diagram" panel and returned with every result.
+example_diagram = "examples/exp.png"
+# Placeholder picture displayed while no reaction SMILES is available to render.
+rdkit_image = "examples/rdkit.png"
+# 解析 ChemEagle 返回的结构化数据
+def parse_reactions(output_json):
+    """
+    解析 JSON 格式的反应数据并格式化输出，包含颜色定制。
+    """
+    if isinstance(output_json, str):
+        reactions_data = json.loads(output_json)
+    elif isinstance(output_json, dict):
+        reactions_data = output_json  # 转换 JSON 字符串为字典
+    reactions_list = reactions_data.get("reactions", [])
+    detailed_output = []
+    smiles_output = []
+    for reaction in reactions_list:
+        reaction_id = reaction.get("reaction_id", "Unknown ID")
+        reactants = [r.get("smiles", "Unknown") for r in reaction.get("reactants", [])]
+        conditions = [
+            f"<span style='color:red'>{c.get('smiles', c.get('text', 'Unknown'))}[{c.get('role', 'Unknown')}]</span>"
+            for c in reaction.get("condition", [])
+        ]
+        conditions_1 = [
+            f"<span style='color:black'>{c.get('smiles', c.get('text', 'Unknown'))}[{c.get('role', 'Unknown')}]</span>"
+            for c in reaction.get("condition", [])
+        ]
+        products = [f"<span style='color:orange'>{p.get('smiles', 'Unknown')}</span>" for p in reaction.get("products", [])]
+        products_1 = [f"<span style='color:black'>{p.get('smiles', 'Unknown')}</span>" for p in reaction.get("products", [])]
+        products_2 = [r.get("smiles", "Unknown") for r in reaction.get("products", [])]
+        additional = reaction.get("additional_info", [])
+        additional_str = [str(x) for x in additional if x is not None]
+        tail = conditions_1 + additional_str
+        tail_str = ", ".join(tail)
+        # 构造反应的完整字符串，定制字体颜色
+        full_reaction = f"{'.'.join(reactants)}>>{'.'.join(products_1)} | {tail_str}"
+        full_reaction = f"<span style='color:black'>{full_reaction}</span>"
+        # 详细反应格式化输出
+        reaction_output = f"<b>Reaction: </b> {reaction_id}<br>"
+        reaction_output += f"  Reactants: <span style='color:blue'>{', '.join(reactants)}</span><br>"
+        reaction_output += f"  Conditions: {', '.join(conditions)}<br>"
+        reaction_output += f"  Products: {', '.join(products)}<br>"
+        reaction_output += f"  additional_info: {', '.join(additional_str)}<br>"
+        reaction_output += f"  <b>Full Reaction:</b> {full_reaction}<br>"
+        reaction_output += "<br>"
+        detailed_output.append(reaction_output)
+        reaction_smiles = f"{'.'.join(reactants)}>>{'.'.join(products_2)}"
+        smiles_output.append(reaction_smiles)
+    return detailed_output, smiles_output
+# 核心处理函数，仅使用 API Key 和图像
+def process_chem_image(api_key, image):
+    # 设置 API Key 环境变量，供 ChemEagle 使用
+    os.environ["CHEMEAGLE_API_KEY"] = api_key
+    # 保存上传图片
+    image_path = "temp_image.png"
+    image.save(image_path)
+    # 调用 ChemEagle（实现内部读取 os.getenv）
+    chemeagle_result = ChemEagle(image_path)
+    # 解析输出
+    detailed, smiles = parse_reactions(chemeagle_result)
+    # 写出 JSON
+    json_path = "output.json"
+    with open(json_path, 'w') as jf:
+        json.dump(chemeagle_result, jf, indent=2)
+    # 返回 HTML、SMILES 合并文本、示意图、JSON 下载
+    return "\n\n".join(detailed), smiles, example_diagram, json_path
+# 构建 Gradio 界面
+with gr.Blocks() as demo:
+    gr.Markdown(
+        """
+        <center><h1>ChemEagle: A Multi-Agent System for Multimodal Chemical Information Extraction</h1></center>
+        Upload a multimodal reaction image and type your OpenAI API key to extract multimodal chemical information.
+        """
+    )
+    with gr.Row():
+        # ———— 左侧：上传 + API Key + 按钮 ————
+        with gr.Column(scale=1):
+            image_input   = gr.Image(type="pil", label="Upload a multimodal reaction image")
+            api_key_input = gr.Textbox(
+                label="Your API-Key",
+                placeholder="Type your OpenAI_API_KEY",
+                type="password"
+            )
+            with gr.Row():
+                clear_btn = gr.Button("Clear")
+                run_btn   = gr.Button("Run", elem_id="submit-btn")
+        # ———— 中间：解析结果 + 示意图 ————
+        with gr.Column(scale=1):
+            gr.Markdown("### Parsed Reactions")
+            reaction_output   = gr.HTML(label="Detailed Reaction Output")
+            gr.Markdown("### Schematic Diagram")
+            schematic_diagram = gr.Image(value=example_diagram, label="示意图")
+        # ———— 右侧：SMILES 拆分 & RDKit 渲染 + JSON 下载 ————
+        with gr.Column(scale=1):
+            gr.Markdown("### Machine-readable Output")
+            smiles_output = gr.Textbox(
+                label="Reaction SMILES",
+                show_copy_button=True,
+                interactive=False,
+                visible=False
+            )
+            @gr.render(inputs = smiles_output)  # 使用gr.render修饰器绑定输入和渲染逻辑
+            def show_split(inputs):  # 定义处理和展示分割文本的函数
+                if not inputs or isinstance(inputs, str) and inputs.strip() == "":  # 检查输入文本是否为空
+                    return gr.Textbox(label= "SMILES of Reaction i"), gr.Image(value=rdkit_image, label= "RDKit Image of Reaction i",height=100)
+                else:
+                    # 假设输入是逗号分隔的 SMILES 字符串
+                    smiles_list = inputs.split(",")
+                    smiles_list = [re.sub(r"^\s*\[?'?|'\]?\s*$", "", item) for item in smiles_list]
+                    components = []  # 初始化一个组件列表，用于存放每个 SMILES 对应的 Textbox 组件
+                    for i, smiles in enumerate(smiles_list):
+                        smiles.replace('"', '').replace("'", "").replace("[", "").replace("]", "")
+                        rxn = rdChemReactions.ReactionFromSmarts(smiles, useSmiles=True)
+                        if rxn:
+                            new_rxn = AllChem.ChemicalReaction()
+                            for mol in rxn.GetReactants():
+                                mol = Chem.MolFromMolBlock(Chem.MolToMolBlock(mol))
+                                new_rxn.AddReactantTemplate(mol)
+                            for mol in rxn.GetProducts():
+                                mol = Chem.MolFromMolBlock(Chem.MolToMolBlock(mol))
+                                new_rxn.AddProductTemplate(mol)
+                            rxn = new_rxn
+                            def atom_mapping_remover(rxn):
+                                for reactant in rxn.GetReactants():
+                                    for atom in reactant.GetAtoms():
+                                        atom.SetAtomMapNum(0)
+                                for product in rxn.GetProducts():
+                                    for atom in product.GetAtoms():
+                                        atom.SetAtomMapNum(0)
+                                return rxn
+                            atom_mapping_remover(rxn)
+                            reactant1 = rxn.GetReactantTemplate(0)
+                            print(reactant1.GetNumBonds)
+                            reactant2 = rxn.GetReactantTemplate(1) if rxn.GetNumReactantTemplates() > 1 else None
+                            if reactant1.GetNumBonds() > 0:
+                                bond_length_reference = Draw.MeanBondLength(reactant1)
+                            elif reactant2 and reactant2.GetNumBonds() > 0:
+                                bond_length_reference = Draw.MeanBondLength(reactant2)
+                            else:
+                                bond_length_reference = 1.0
+                            drawer = rdMolDraw2D.MolDraw2DSVG(-1, -1)
+                            dopts = drawer.drawOptions()
+                            dopts.padding = 0.1
+                            dopts.includeRadicals = True
+                            Draw.SetACS1996Mode(dopts, bond_length_reference*0.55)
+                            dopts.bondLineWidth = 1.5
+                            drawer.DrawReaction(rxn)
+                            drawer.FinishDrawing()
+                            svg_content = drawer.GetDrawingText()
+                            svg_file = f"reaction{i+1}.svg"
+                            with open(svg_file, "w") as f:
+                                f.write(svg_content)
+                            png_file = f"reaction_{i+1}.png"
+                            cairosvg.svg2png(url=svg_file, write_to=png_file)
+                        components.append(gr.Textbox(value=smiles,label= f"SMILES of Reaction {i}", show_copy_button=True, interactive=False))
+                        components.append(gr.Image(value=png_file,label= f"RDKit Image of Reaction {i}"))
+                    return components  # 返回包含所有 SMILES Textbox 组件的列表
+            download_json = gr.File(label="Download JSON File")
+    gr.Examples(
+        examples=[
+            ["examples/reaction1.jpg", ""],
+            ["examples/reaction2.png", ""],
+            ["examples/reaction3.png", ""],
+            ["examples/reaction4.png", ""],
+        ],
+        inputs=[image_input, api_key_input],
+        outputs=[reaction_output, smiles_output, schematic_diagram, download_json],
+        cache_examples=False,
+        examples_per_page=4,
+    )
+    # ———— 清空与运行 绑定 ————
+    clear_btn.click(
+        lambda: (None, None, None, None, None),
+        inputs=[],
+        outputs=[image_input, api_key_input, reaction_output, smiles_output, download_json]
+    )
+    run_btn.click(
+        process_chem_image,
+        inputs=[api_key_input, image_input],
+        outputs=[reaction_output, smiles_output, schematic_diagram, download_json]
+    )
+    # 自定义按钮样式
+    demo.css = """
+    #submit-btn {
+        background-color: #FF914D;
+        color: white;
+        font-weight: bold;
+    }
+    """
+    demo.launch()

chemiener/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .interface import ChemNER

chemiener/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (187 Bytes). View file

chemiener/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (185 Bytes). View file

chemiener/__pycache__/dataset.cpython-310.pyc ADDED Viewed

Binary file (5.37 kB). View file

chemiener/__pycache__/dataset.cpython-38.pyc ADDED Viewed

Binary file (5.35 kB). View file

chemiener/__pycache__/interface.cpython-310.pyc ADDED Viewed

Binary file (4.46 kB). View file

chemiener/__pycache__/interface.cpython-38.pyc ADDED Viewed

Binary file (4.47 kB). View file

chemiener/__pycache__/model.cpython-310.pyc ADDED Viewed

Binary file (684 Bytes). View file

chemiener/__pycache__/model.cpython-38.pyc ADDED Viewed

Binary file (680 Bytes). View file

chemiener/__pycache__/utils.cpython-310.pyc ADDED Viewed

Binary file (1.67 kB). View file

chemiener/__pycache__/utils.cpython-38.pyc ADDED Viewed

Binary file (1.53 kB). View file

chemiener/dataset.py ADDED Viewed

	@@ -0,0 +1,172 @@

+import os
+import cv2
+import copy
+import random
+import json
+import contextlib
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn.functional as F
+from torch.utils.data import DataLoader, Dataset
+from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
+from transformers import BertTokenizerFast, AutoTokenizer, RobertaTokenizerFast
+from .utils import get_class_to_index
+class NERDataset(Dataset):
+    def __init__(self, args, data_file, split='train'):
+        super().__init__()
+        self.args = args
+        if data_file:
+            data_path = os.path.join(args.data_path, data_file)
+            with open(data_path) as f:
+                self.data = json.load(f)
+            self.name = os.path.basename(data_file).split('.')[0]
+        self.split = split
+        self.is_train = (split == 'train')
+        self.tokenizer = AutoTokenizer.from_pretrained(self.args.roberta_checkpoint, cache_dir = self.args.cache_dir)#BertTokenizerFast.from_pretrained('allenai/scibert_scivocab_uncased')
+        self.class_to_index = get_class_to_index(self.args.corpus)
+        self.index_to_class = {self.class_to_index[key]: key for key in self.class_to_index}
+    #commment
+    def __len__(self):
+        return len(self.data)
+    def __getitem__(self, idx):
+        text_tokenized = self.tokenizer(self.data[str(idx)]['text'], truncation = True, max_length = self.args.max_seq_length)
+        if len(text_tokenized['input_ids']) > 512: print(len(text_tokenized['input_ids']))
+        text_tokenized_untruncated = self.tokenizer(self.data[str(idx)]['text'])
+        return text_tokenized, self.align_labels(text_tokenized, self.data[str(idx)]['entities'], len(self.data[str(idx)]['text'])), self.align_labels(text_tokenized_untruncated, self.data[str(idx)]['entities'], len(self.data[str(idx)]['text']))
+    def align_labels(self, text_tokenized, entities, length):
+        char_to_class = {}
+        for entity in entities:
+            for span in entities[entity]["span"]:
+                for i in range(span[0], span[1]):
+                    char_to_class[i] = self.class_to_index[('B-' if i == span[0] else 'I-')+str(entities[entity]["type"])]
+        for i in range(length):
+            if i not in char_to_class:
+                char_to_class[i] = 0
+        classes = []
+        for i in range(len(text_tokenized[0])):
+            span = text_tokenized.token_to_chars(i)
+            if span is not None:
+                classes.append(char_to_class[span.start])
+            else:
+                classes.append(-100)
+        return torch.LongTensor(classes)
+    def make_html(word_tokens, predictions):
+        toreturn = '''<!DOCTYPE html>
+    <html>
+    <head>
+        <title>Named Entity Recognition Visualization</title>
+        <style>
+            .EXAMPLE_LABEL {
+                color: red;
+                text-decoration: underline red;
+            }
+            .REACTION_PRODUCT {
+                color: orange;
+                text-decoration: underline orange;
+            }
+            .STARTING_MATERIAL {
+                color: gold;
+                text-decoration: underline gold;
+            }
+            .REAGENT_CATALYST {
+                color: green;
+                text-decoration: underline green;
+            }
+            .SOLVENT {
+                color: cyan;
+                text-decoration: underline cyan;
+            }
+            .OTHER_COMPOUND {
+                color: blue;
+                text-decoration: underline blue;
+            }
+            .TIME {
+                color: purple;
+                text-decoration: underline purple;
+            }
+            .TEMPERATURE {
+                color: magenta;
+                text-decoration: underline magenta;
+            }
+            .YIELD_OTHER {
+                color: palegreen;
+                text-decoration: underline palegreen;
+            }
+            .YIELD_PERCENT {
+                color: pink;
+                text-decoration: underline pink;
+            }
+        </style>
+    </head>
+    <body>
+        <p>'''
+        last_label = None
+        for idx, item in enumerate(word_tokens):
+            decoded = self.tokenizer.decode(item, skip_special_tokens = True)
+            if len(decoded)>0:
+                if idx!=0 and decoded[0]!='#':
+                    toreturn+=" "
+                label = predictions[idx]
+                if label == last_label:
+                    toreturn+=decoded if decoded[0]!="#" else decoded[2:]
+                else:
+                    if last_label is not None and last_label>0:
+                        toreturn+="</u>"
+                    if label >0:
+                        toreturn+="<u class=\""
+                        toreturn+=self.index_to_class[label]
+                        toreturn+="\">"
+                        toreturn+=decoded if decoded[0]!="#" else decoded[2:]
+                    if label == 0:
+                        toreturn+=decoded if decoded[0]!="#" else decoded[2:]
+                if idx==len(word_tokens) and label>0:
+                    toreturn+="</u>"
+                last_label = label
+        toreturn += '''    </p>
+        </body>
+        </html>'''
+        return toreturn
+def get_collate_fn():
+    def collate(batch):
+        sentences = []
+        masks = []
+        refs = []
+        for ex in batch:
+            sentences.append(torch.LongTensor(ex[0]['input_ids']))
+            masks.append(torch.Tensor(ex[0]['attention_mask']))
+            refs.append(ex[1])
+        sentences = pad_sequence(sentences, batch_first = True, padding_value = 0)
+        masks = pad_sequence(masks, batch_first = True, padding_value = 0)
+        refs = pad_sequence(refs, batch_first = True, padding_value = -100)
+        return sentences, masks, refs
+    return collate

chemiener/interface.py ADDED Viewed

	@@ -0,0 +1,124 @@

+import os
+import argparse
+from typing import List
+import torch
+import numpy as np
+from .model import build_model
+from .dataset import NERDataset, get_collate_fn
+from huggingface_hub import hf_hub_download
+from .utils import get_class_to_index
+class ChemNER:
+    def __init__(self, model_path, device = None, cache_dir = None):
+        self.args = self._get_args(cache_dir)
+        states = torch.load(model_path, map_location = torch.device('cpu'))
+        if device is None:
+            device = torch.device('cpu')
+        self.device = device
+        self.model = self.get_model(self.args, device, states['state_dict'])
+        self.collate = get_collate_fn()
+        self.dataset = NERDataset(self.args, data_file = None)
+        self.class_to_index = get_class_to_index(self.args.corpus)
+        self.index_to_class = {self.class_to_index[key]: key for key in self.class_to_index}
+    def _get_args(self, cache_dir):
+        parser = argparse.ArgumentParser()
+        parser.add_argument('--roberta_checkpoint', default = 'dmis-lab/biobert-large-cased-v1.1', type=str, help='which roberta config to use')
+        parser.add_argument('--corpus', default = "chemdner", type=str, help="which corpus should the tags be from")
+        args = parser.parse_args([])
+        args.cache_dir = cache_dir
+        return args
+    def get_model(self, args, device, model_states):
+        model = build_model(args)
+        def remove_prefix(state_dict):
+            return {k.replace('model.', ''): v for k, v in state_dict.items()}
+        model.load_state_dict(remove_prefix(model_states), strict = False)
+        model.to(device)
+        model.eval()
+        return model
+    def predict_strings(self, strings: List, batch_size = 8):
+        device = self.device
+        predictions = []
+        def prepare_output(char_span, prediction):
+            toreturn = []
+            i = 0
+            while i < len(char_span):
+                if prediction[i][0] == 'B':
+                    toreturn.append((prediction[i][2:], [char_span[i].start, char_span[i].end]))
+                elif len(toreturn) > 0 and prediction[i][2:] == toreturn[-1][0]:
+                    toreturn[-1] = (toreturn[-1][0], [toreturn[-1][1][0], char_span[i].end])
+                i += 1
+            return toreturn
+        output = []
+        for idx in range(0, len(strings), batch_size):
+            batch_strings = strings[idx:idx+batch_size]
+            batch_strings_tokenized = [(self.dataset.tokenizer(s, truncation = True, max_length = 512),  torch.Tensor([-1]), torch.Tensor([-1]) ) for s in batch_strings]
+            sentences, masks, refs = self.collate(batch_strings_tokenized)
+            predictions = self.model(input_ids = sentences.to(device), attention_mask = masks.to(device))[0].argmax(dim = 2).to('cpu')
+            sentences_list = list(sentences)
+            predictions_list = list(predictions)
+            char_spans = []
+            for j, sentence in enumerate(sentences_list):
+                to_add = [batch_strings_tokenized[j][0].token_to_chars(i) for i, word in enumerate(sentence) if len(self.dataset.tokenizer.decode(int(word.item()), skip_special_tokens = True)) > 0 ]
+                char_spans.append(to_add)
+            class_predictions = [[self.index_to_class[int(pred.item())] for (pred, word) in zip(sentence_p, sentence_w) if len(self.dataset.tokenizer.decode(int(word.item()), skip_special_tokens = True)) > 0] for (sentence_p, sentence_w) in zip(predictions_list, sentences_list)]
+            output+=[prepare_output(char_span, prediction) for char_span, prediction in zip(char_spans, class_predictions)]
+        return output

chemiener/main.py ADDED Viewed

	@@ -0,0 +1,345 @@

+import os
+import math
+import json
+import random
+import argparse
+import numpy as np
+import time
+import torch
+from torch.profiler import profile, record_function, ProfilerActivity
+import torch.distributed as dist
+import pytorch_lightning as pl
+from pytorch_lightning import LightningModule, LightningDataModule
+from pytorch_lightning.callbacks import LearningRateMonitor
+from pytorch_lightning.strategies.ddp import DDPStrategy
+from transformers import get_scheduler
+import transformers
+from dataset import NERDataset, get_collate_fn
+from model import build_model
+from utils import get_class_to_index
+import evaluate
+from seqeval.metrics import accuracy_score
+from seqeval.metrics import classification_report
+from seqeval.metrics import f1_score
+from seqeval.scheme import IOB2
+def get_args(notebook=False):
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--do_train', action='store_true')
+    parser.add_argument('--do_valid', action='store_true')
+    parser.add_argument('--do_test', action='store_true')
+    parser.add_argument('--fp16', action='store_true')
+    parser.add_argument('--seed', type=int, default=42)
+    parser.add_argument('--gpus', type=int, default=1)
+    parser.add_argument('--print_freq', type=int, default=200)
+    parser.add_argument('--debug', action='store_true')
+    parser.add_argument('--no_eval', action='store_true')
+    # Data
+    parser.add_argument('--data_path', type=str, default=None)
+    parser.add_argument('--image_path', type=str, default=None)
+    parser.add_argument('--train_file', type=str, default=None)
+    parser.add_argument('--valid_file', type=str, default=None)
+    parser.add_argument('--test_file', type=str, default=None)
+    parser.add_argument('--vocab_file', type=str, default=None)
+    parser.add_argument('--format', type=str, default='reaction')
+    parser.add_argument('--num_workers', type=int, default=8)
+    parser.add_argument('--input_size', type=int, default=224)
+    # Training
+    parser.add_argument('--epochs', type=int, default=8)
+    parser.add_argument('--batch_size', type=int, default=256)
+    parser.add_argument('--lr', type=float, default=1e-4)
+    parser.add_argument('--weight_decay', type=float, default=0.05)
+    parser.add_argument('--max_grad_norm', type=float, default=5.)
+    parser.add_argument('--scheduler', type=str, choices=['cosine', 'constant'], default='cosine')
+    parser.add_argument('--warmup_ratio', type=float, default=0)
+    parser.add_argument('--gradient_accumulation_steps', type=int, default=1)
+    parser.add_argument('--load_path', type=str, default=None)
+    parser.add_argument('--load_encoder_only', action='store_true')
+    parser.add_argument('--train_steps_per_epoch', type=int, default=-1)
+    parser.add_argument('--eval_per_epoch', type=int, default=10)
+    parser.add_argument('--save_path', type=str, default='output/')
+    parser.add_argument('--save_mode', type=str, default='best', choices=['best', 'all', 'last'])
+    parser.add_argument('--load_ckpt', type=str, default='best')
+    parser.add_argument('--resume', action='store_true')
+    parser.add_argument('--num_train_example', type=int, default=None)
+    parser.add_argument('--roberta_checkpoint', type=str, default = "roberta-base")
+    parser.add_argument('--corpus', type=str, default = "chemu")
+    parser.add_argument('--cache_dir')
+    parser.add_argument('--eval_truncated', action='store_true')
+    parser.add_argument('--max_seq_length', type = int, default=512)
+    args = parser.parse_args([]) if notebook else parser.parse_args()
+    return args
+class ChemIENERecognizer(LightningModule):
+    def __init__(self, args):
+        super().__init__()
+        self.args = args
+        self.model = build_model(args)
+        self.validation_step_outputs = []
+    def training_step(self, batch, batch_idx):
+        sentences, masks, refs,_ = batch
+        '''
+        print("sentences " + str(sentences))
+        print("sentence shape " + str(sentences.shape))
+        print("masks " + str(masks))
+        print("masks shape " + str(masks.shape))
+        print("refs " + str(refs))
+        print("refs shape " + str(refs.shape))
+        '''
+        loss, logits = self.model(input_ids=sentences, attention_mask=masks, labels=refs)
+        self.log('train/loss', loss)
+        self.log('lr', self.lr_schedulers().get_lr()[0], prog_bar=True, logger=False)
+        return loss
+    def validation_step(self, batch, batch_idx):
+        sentences, masks, refs, untruncated = batch
+        '''
+        print("sentences " + str(sentences))
+        print("sentence shape " + str(sentences.shape))
+        print("masks " + str(masks))
+        print("masks shape " + str(masks.shape))
+        print("refs " + str(refs))
+        print("refs shape " + str(refs.shape))
+        '''
+        logits = self.model(input_ids = sentences, attention_mask=masks)[0]
+        '''
+        print("logits " + str(logits))
+        print(sentences.shape)
+        print(logits.shape)
+        print(torch.eq(logits.argmax(dim = 2), refs).sum())
+        '''
+        self.validation_step_outputs.append((sentences.to("cpu"), logits.argmax(dim = 2).to("cpu"), refs.to('cpu'), untruncated.to("cpu")))
+    def on_validation_epoch_end(self):
+        if self.trainer.num_devices > 1:
+            gathered_outputs = [None for i in range(self.trainer.num_devices)]
+            dist.all_gather_object(gathered_outputs, self.validation_step_outputs)
+            gathered_outputs = sum(gathered_outputs, [])
+        else:
+            gathered_outputs = self.validation_step_outputs
+        sentences = [list(output[0]) for output in gathered_outputs]
+        class_to_index = get_class_to_index(self.args.corpus)
+        index_to_class = {class_to_index[key]: key for key in class_to_index}
+        predictions = [list(output[1]) for output in gathered_outputs]
+        labels = [list(output[2]) for output in gathered_outputs]
+        untruncateds = [list(output[3]) for output in gathered_outputs]
+        untruncateds = [[index_to_class[int(label.item())] for label in sentence if int(label.item()) != -100] for batched in untruncateds for sentence in batched]
+        output = {"sentences": [[int(word.item()) for (word, label) in zip(sentence_w, sentence_l) if label != -100] for (batched_w, batched_l) in zip(sentences, labels) for (sentence_w, sentence_l) in zip(batched_w, batched_l) ],
+                  "predictions": [[index_to_class[int(pred.item())] for (pred, label) in zip(sentence_p, sentence_l) if label!=-100] for (batched_p, batched_l) in zip(predictions, labels) for (sentence_p, sentence_l) in zip(batched_p, batched_l)  ],
+                  "groundtruth": [[index_to_class[int(label.item())] for label in sentence if label != -100] for batched in labels for sentence in batched]}
+        #true_labels = [str(label.item()) for batched in labels for sentence in batched for label in sentence if label != -100]
+        #true_predictions = [str(pred.item()) for (batched_p, batched_l) in zip(predictions, labels) for (sentence_p, sentence_l) in zip(batched_p, batched_l) for (pred, label) in zip(sentence_p, sentence_l) if label!=-100 ]
+        #print("true_label " + str(len(true_labels)) + " true_predictions "+str(len(true_predictions)))
+        #predictions = utils.merge_predictions(gathered_outputs)
+        name = self.eval_dataset.name
+        scores = [0]
+        #print(predictions)
+        #print(predictions[0].shape)
+        if self.trainer.is_global_zero:
+            if not self.args.no_eval:
+                epoch = self.trainer.current_epoch
+                metric = evaluate.load("seqeval", cache_dir = self.args.cache_dir)
+                predictions = [ preds + ['O'] * (len(full_groundtruth) - len(preds)) for (preds, full_groundtruth) in zip(output['predictions'], untruncateds)]
+                all_metrics = metric.compute(predictions = predictions, references = untruncateds)
+                #accuracy = sum([1 if p == l else 0 for (p, l) in zip(true_predictions, true_labels)])/len(true_labels)
+                #precision = torch.eq(self.eval_dataset.data, predictions.argmax(dim = 1)).sum().float()/self.eval_dataset.data.numel()
+                #self.print("Epoch: "+str(epoch)+" accuracy: "+str(accuracy))
+                if self.args.eval_truncated:
+                    report = classification_report(output['groundtruth'], output['predictions'], mode = 'strict', scheme = IOB2, output_dict = True)
+                else:
+                    #report = classification_report(predictions, untruncateds, output_dict = True)#, mode = 'strict', scheme = IOB2, output_dict = True)
+                    report = classification_report(predictions, untruncateds, mode = 'strict', scheme = IOB2, output_dict = True)
+                self.print(report)
+                #self.print("______________________________________________")
+                #self.print(report_strict)
+                scores = [report['micro avg']['f1-score']]
+            with open(os.path.join(self.trainer.default_root_dir, f'prediction_{name}.json'), 'w') as f:
+                    json.dump(output, f)
+        dist.broadcast_object_list(scores)
+        self.log('val/score', scores[0], prog_bar=True, rank_zero_only=True)
+        self.validation_step_outputs.clear()
+        self.validation_step_outputs.clear()
+    def configure_optimizers(self):
+        num_training_steps = self.trainer.num_training_steps
+        self.print(f'Num training steps: {num_training_steps}')
+        num_warmup_steps = int(num_training_steps * self.args.warmup_ratio)
+        optimizer = torch.optim.AdamW(self.parameters(), lr=self.args.lr, weight_decay=self.args.weight_decay)
+        scheduler = get_scheduler(self.args.scheduler, optimizer, num_warmup_steps, num_training_steps)
+        return {'optimizer': optimizer, 'lr_scheduler': {'scheduler': scheduler, 'interval': 'step'}}
+class NERDataModule(LightningDataModule):
+    def __init__(self, args):
+        super().__init__()
+        self.args = args
+        self.collate_fn = get_collate_fn()
+    def prepare_data(self):
+        args = self.args
+        if args.do_train:
+            self.train_dataset = NERDataset(args, args.train_file, split='train')
+        if self.args.do_train or self.args.do_valid:
+            self.val_dataset = NERDataset(args, args.valid_file, split='valid')
+        if self.args.do_test:
+            self.test_dataset = NERDataset(args, args.test_file, split='valid')
+    def print_stats(self):
+        if self.args.do_train:
+            print(f'Train dataset: {len(self.train_dataset)}')
+        if self.args.do_train or self.args.do_valid:
+            print(f'Valid dataset: {len(self.val_dataset)}')
+        if self.args.do_test:
+            print(f'Test dataset: {len(self.test_dataset)}')
+    def train_dataloader(self):
+        return torch.utils.data.DataLoader(
+            self.train_dataset, batch_size=self.args.batch_size, num_workers=self.args.num_workers,
+            collate_fn=self.collate_fn)
+    def val_dataloader(self):
+        return torch.utils.data.DataLoader(
+            self.val_dataset, batch_size=self.args.batch_size, num_workers=self.args.num_workers,
+            collate_fn=self.collate_fn)
+    def test_dataloader(self):
+        return torch.utils.data.DataLoader(
+            self.test_dataset, batch_size=self.args.batch_size, num_workers=self.args.num_workers,
+            collate_fn=self.collate_fn)
+class ModelCheckpoint(pl.callbacks.ModelCheckpoint):
+    def _get_metric_interpolated_filepath_name(self, monitor_candidates, trainer, del_filepath=None) -> str:
+        filepath = self.format_checkpoint_name(monitor_candidates)
+        return filepath
+def main():
+    transformers.utils.logging.set_verbosity_error()
+    args = get_args()
+    pl.seed_everything(args.seed, workers = True)
+    if args.do_train:
+        model = ChemIENERecognizer(args)
+    else:
+        model = ChemIENERecognizer.load_from_checkpoint(os.path.join(args.save_path, 'checkpoints/best.ckpt'), strict=False,
+                                        args=args)
+    dm = NERDataModule(args)
+    dm.prepare_data()
+    dm.print_stats()
+    checkpoint = ModelCheckpoint(monitor='val/score', mode='max', save_top_k=1, filename='best', save_last=True)
+    # checkpoint = ModelCheckpoint(monitor=None, save_top_k=0, save_last=True)
+    lr_monitor = LearningRateMonitor(logging_interval='step')
+    logger = pl.loggers.TensorBoardLogger(args.save_path, name='', version='')
+    trainer = pl.Trainer(
+        strategy=DDPStrategy(find_unused_parameters=False),
+        accelerator='gpu',
+        precision = 16,
+        devices=args.gpus,
+        logger=logger,
+        default_root_dir=args.save_path,
+        callbacks=[checkpoint, lr_monitor],
+        max_epochs=args.epochs,
+        gradient_clip_val=args.max_grad_norm,
+        accumulate_grad_batches=args.gradient_accumulation_steps,
+        check_val_every_n_epoch=args.eval_per_epoch,
+        log_every_n_steps=10,
+        deterministic='warn')
+    if args.do_train:
+        trainer.num_training_steps = math.ceil(
+            len(dm.train_dataset) / (args.batch_size * args.gpus * args.gradient_accumulation_steps)) * args.epochs
+        model.eval_dataset = dm.val_dataset
+        ckpt_path = os.path.join(args.save_path, 'checkpoints/last.ckpt') if args.resume else None
+        trainer.fit(model, datamodule=dm, ckpt_path=ckpt_path)
+        model = ChemIENERecognizer.load_from_checkpoint(checkpoint.best_model_path, args=args)
+    if args.do_valid:
+        model.eval_dataset = dm.val_dataset
+        trainer.validate(model, datamodule=dm)
+    if args.do_test:
+        model.test_dataset = dm.test_dataset
+        trainer.test(model, datamodule=dm)
+if __name__ == "__main__":
+    main()

chemiener/model.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import torch
+from torch import nn
+from transformers import BertForTokenClassification, RobertaForTokenClassification, AutoModelForTokenClassification
+def build_model(args):
+    if args.corpus == "chemu":
+        return AutoModelForTokenClassification.from_pretrained(args.roberta_checkpoint, num_labels = 21, cache_dir = args.cache_dir, return_dict = False)
+    elif args.corpus == "chemdner":
+        return AutoModelForTokenClassification.from_pretrained(args.roberta_checkpoint, num_labels = 17, cache_dir = args.cache_dir, return_dict = False)
+    elif args.corpus == "chemdner-mol":
+        return AutoModelForTokenClassification.from_pretrained(args.roberta_checkpoint, num_labels = 3, cache_dir = args.cache_dir, return_dict = False)

chemiener/utils.py ADDED Viewed

	@@ -0,0 +1,23 @@

+def merge_predictions(results):
+    if len(results) == 0:
+        return []
+    predictions = {}
+    for batch_preds in results:
+        for idx, preds in enumerate(batch_preds):
+            predictions[idx] = preds
+    predictions = [predictions[i] for i in range(len(predictions))]
+    return predictions
+def get_class_to_index(corpus):
+    if corpus == "chemu":
+        return {'B-EXAMPLE_LABEL': 1, 'B-REACTION_PRODUCT': 2, 'B-STARTING_MATERIAL': 3, 'B-REAGENT_CATALYST': 4, 'B-SOLVENT': 5, 'B-OTHER_COMPOUND': 6, 'B-TIME': 7, 'B-TEMPERATURE': 8, 'B-YIELD_OTHER': 9, 'B-YIELD_PERCENT': 10, 'O': 0,
+            'I-EXAMPLE_LABEL': 11, 'I-REACTION_PRODUCT': 12, 'I-STARTING_MATERIAL': 13, 'I-REAGENT_CATALYST': 14, 'I-SOLVENT': 15, 'I-OTHER_COMPOUND': 16, 'I-TIME': 17, 'I-TEMPERATURE': 18, 'I-YIELD_OTHER': 19, 'I-YIELD_PERCENT': 20}
+    elif corpus == "chemdner":
+        return {'O': 0, 'B-ABBREVIATION': 1, 'B-FAMILY': 2,  'B-FORMULA': 3, 'B-IDENTIFIER': 4, 'B-MULTIPLE': 5, 'B-SYSTEMATIC': 6, 'B-TRIVIAL': 7, 'B-NO CLASS': 8, 'I-ABBREVIATION': 9, 'I-FAMILY': 10,  'I-FORMULA': 11, 'I-IDENTIFIER': 12, 'I-MULTIPLE': 13, 'I-SYSTEMATIC': 14, 'I-TRIVIAL': 15, 'I-NO CLASS': 16}
+    elif corpus == "chemdner-mol":
+        return {'O': 0, 'B-MOL': 1, 'I-MOL': 2}

chemietoolkit/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .interface import ChemIEToolkit

chemietoolkit/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (198 Bytes). View file

chemietoolkit/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (189 Bytes). View file

chemietoolkit/__pycache__/chemrxnextractor.cpython-310.pyc ADDED Viewed

Binary file (3.66 kB). View file

chemietoolkit/__pycache__/chemrxnextractor.cpython-38.pyc ADDED Viewed

Binary file (3.62 kB). View file

chemietoolkit/__pycache__/interface.cpython-310.pyc ADDED Viewed

Binary file (29.3 kB). View file

chemietoolkit/__pycache__/interface.cpython-38.pyc ADDED Viewed

Binary file (30 kB). View file

chemietoolkit/__pycache__/tableextractor.cpython-310.pyc ADDED Viewed

Binary file (10.3 kB). View file

chemietoolkit/__pycache__/utils.cpython-310.pyc ADDED Viewed

Binary file (25 kB). View file

chemietoolkit/chemrxnextractor.py ADDED Viewed

	@@ -0,0 +1,107 @@

+from PyPDF2 import PdfReader, PdfWriter
+import pdfminer.high_level
+import pdfminer.layout
+from operator import itemgetter
+import os
+import pdftotext
+from chemrxnextractor import RxnExtractor
+class ChemRxnExtractor(object):
+    def __init__(self, pdf, pn, model_dir, device):
+        self.pdf_file = pdf
+        self.pages = pn
+        self.model_dir = os.path.join(model_dir, "cre_models_v0.1") # directory saving both prod and role models
+        use_cuda = (device == 'cuda')
+        self.rxn_extractor = RxnExtractor(self.model_dir, use_cuda=use_cuda)
+        self.text_file = "info.txt"
+        self.pdf_text = ""
+        if len(self.pdf_file) > 0:
+            with open(self.pdf_file, "rb") as f:
+                self.pdf_text = pdftotext.PDF(f)
+    def set_pdf_file(self, pdf):
+        self.pdf_file = pdf
+        with open(self.pdf_file, "rb") as f:
+            self.pdf_text = pdftotext.PDF(f)
+    def set_pages(self, pn):
+        self.pages = pn
+    def set_model_dir(self, md):
+        self.model_dir = md
+        self.rxn_extractor = RxnExtractor(self.model_dir)
+    def set_text_file(self, tf):
+        self.text_file = tf
+    def extract_reactions_from_text(self):
+        if self.pages is None:
+            return self.extract_all(len(self.pdf_text))
+        else:
+            return self.extract_all(self.pages)
+    def extract_all(self, pages):
+        ans = []
+        text = self.get_paragraphs_from_pdf(pages)
+        for data in text:
+            L = [sent for paragraph in data['paragraphs'] for sent in paragraph]
+            reactions = self.get_reactions(L, page_number=data['page'])
+            ans.append(reactions)
+        return ans
+    def get_reactions(self, sents, page_number=None):
+        rxns = self.rxn_extractor.get_reactions(sents)
+        ret = []
+        for r in rxns:
+            if len(r['reactions']) != 0: ret.append(r)
+        ans = {}
+        ans.update({'page' : page_number})
+        ans.update({'reactions' : ret})
+        return ans
+    def get_paragraphs_from_pdf(self, pages):
+        current_page_num = 1
+        if pages is None:
+            pages = len(self.pdf_text)
+        result = []
+        for page in range(pages):
+            content = self.pdf_text[page]
+            pg = content.split("\n\n")
+            L = []
+            for line in pg:
+                paragraph = []
+                if '\x0c' in line:
+                    continue
+                text = line
+                text = text.replace("\n", " ")
+                text = text.replace("- ", "-")
+                curind = 0
+                i = 0
+                while i < len(text):
+                    if text[i] == '.':
+                        if i != 0 and not text[i-1].isdigit() or i != len(text) - 1 and (text[i+1] == " " or text[i+1] == "\n"):
+                            paragraph.append(text[curind:i+1] + "\n")
+                            while(i < len(text) and text[i] != " "):
+                                i += 1
+                            curind = i + 1
+                    i += 1
+                if curind != i:
+                    if text[i - 1] == " ":
+                        if i != 1:
+                            i -= 1
+                        else:
+                            break
+                    if text[i - 1] != '.':
+                        paragraph.append(text[curind:i] + ".\n")
+                    else:
+                        paragraph.append(text[curind:i] + "\n")
+                L.append(paragraph)
+            result.append({
+                'paragraphs': L,
+                'page': current_page_num
+            })
+            current_page_num += 1
+        return result

chemietoolkit/interface.py ADDED Viewed

	@@ -0,0 +1,749 @@

+import torch
+import re
+from functools import lru_cache
+import layoutparser as lp
+import pdf2image
+from PIL import Image
+from huggingface_hub import hf_hub_download, snapshot_download
+from molscribe import MolScribe
+from rxnscribe import RxnScribe, MolDetect
+from chemiener import ChemNER
+from .chemrxnextractor import ChemRxnExtractor
+from .tableextractor import TableExtractor
+from .utils import *
+class ChemIEToolkit:
+    def __init__(self, device=None):
+        if device is None:
+            self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+        else:
+            self.device = torch.device(device)
+        self._molscribe = None
+        self._rxnscribe = None
+        self._pdfparser = None
+        self._moldet = None
+        self._chemrxnextractor = None
+        self._chemner = None
+        self._coref = None
+    @property
+    def molscribe(self):
+        if self._molscribe is None:
+            self.init_molscribe()
+        return self._molscribe
+    @lru_cache(maxsize=None)
+    def init_molscribe(self, ckpt_path=None):
+        """
+        Set model to custom checkpoint
+        Parameters:
+            ckpt_path: path to checkpoint to use, if None then will use default
+        """
+        if ckpt_path is None:
+            ckpt_path = hf_hub_download("yujieq/MolScribe", "swin_base_char_aux_1m.pth")
+        self._molscribe = MolScribe(ckpt_path, device=self.device)
+    @property
+    def rxnscribe(self):
+        if self._rxnscribe is None:
+            self.init_rxnscribe()
+        return self._rxnscribe
+    @lru_cache(maxsize=None)
+    def init_rxnscribe(self, ckpt_path=None):
+        """
+        Set model to custom checkpoint
+        Parameters:
+            ckpt_path: path to checkpoint to use, if None then will use default
+        """
+        if ckpt_path is None:
+            ckpt_path = hf_hub_download("yujieq/RxnScribe", "pix2seq_reaction_full.ckpt")
+        self._rxnscribe = RxnScribe(ckpt_path, device=self.device)
+    @property
+    def pdfparser(self):
+        if self._pdfparser is None:
+            self.init_pdfparser()
+        return self._pdfparser
+    @lru_cache(maxsize=None)
+    def init_pdfparser(self, ckpt_path=None):
+        """
+        Set model to custom checkpoint
+        Parameters:
+            ckpt_path: path to checkpoint to use, if None then will use default
+        """
+        config_path = "lp://efficientdet/PubLayNet/tf_efficientdet_d1"
+        self._pdfparser = lp.AutoLayoutModel(config_path, model_path=ckpt_path, device=self.device.type)
+    @property
+    def moldet(self):
+        if self._moldet is None:
+            self.init_moldet()
+        return self._moldet
+    @lru_cache(maxsize=None)
+    def init_moldet(self, ckpt_path=None):
+        """
+        Set model to custom checkpoint
+        Parameters:
+            ckpt_path: path to checkpoint to use, if None then will use default
+        """
+        if ckpt_path is None:
+            ckpt_path = hf_hub_download("Ozymandias314/MolDetectCkpt", "best_hf.ckpt")
+        self._moldet = MolDetect(ckpt_path, device=self.device)
+    @property
+    def coref(self):
+        if self._coref is None:
+            self.init_coref()
+        return self._coref
+    @lru_cache(maxsize=None)
+    def init_coref(self, ckpt_path=None):
+        """
+        Set model to custom checkpoint
+        Parameters:
+            ckpt_path: path to checkpoint to use, if None then will use default
+        """
+        if ckpt_path is None:
+            ckpt_path = hf_hub_download("Ozymandias314/MolDetectCkpt", "coref_best_hf.ckpt")
+        self._coref = MolDetect(ckpt_path, device=self.device, coref=True)
+    @property
+    def chemrxnextractor(self):
+        if self._chemrxnextractor is None:
+            self.init_chemrxnextractor()
+        return self._chemrxnextractor
+    @lru_cache(maxsize=None)
+    def init_chemrxnextractor(self, ckpt_path=None):
+        """
+        Set model to custom checkpoint
+        Parameters:
+            ckpt_path: path to checkpoint to use, if None then will use default
+        """
+        if ckpt_path is None:
+            ckpt_path = snapshot_download(repo_id="amberwang/chemrxnextractor-training-modules")
+        self._chemrxnextractor = ChemRxnExtractor("", None, ckpt_path, self.device.type)
+    @property
+    def chemner(self):
+        if self._chemner is None:
+            self.init_chemner()
+        return self._chemner
+    @lru_cache(maxsize=None)
+    def init_chemner(self, ckpt_path=None):
+        """
+        Set model to custom checkpoint
+        Parameters:
+            ckpt_path: path to checkpoint to use, if None then will use default
+        """
+        if ckpt_path is None:
+            ckpt_path = hf_hub_download("Ozymandias314/ChemNERckpt", "best.ckpt")
+        self._chemner = ChemNER(ckpt_path, device=self.device)
+    @property
+    def tableextractor(self):
+        return TableExtractor()
+    def extract_figures_from_pdf(self, pdf, num_pages=None, output_bbox=False, output_image=True):
+        """
+        Find and return all figures from a pdf page
+        Parameters:
+            pdf: path to pdf
+            num_pages: process only first `num_pages` pages, if `None` then process all
+            output_bbox: whether to output bounding boxes for each individual entry of a table
+            output_image: whether to include PIL image for figures. default is True
+        Returns:
+            list of content in the following format
+            [
+                { # first figure
+                    'title': str,
+                    'figure': {
+                        'image': PIL image or None,
+                        'bbox': list in form [x1, y1, x2, y2],
+                    }
+                    'table': {
+                        'bbox': list in form [x1, y1, x2, y2] or empty list,
+                        'content': {
+                            'columns': list of column headers,
+                            'rows': list of list of row content,
+                        } or None
+                    }
+                    'footnote': str or empty,
+                    'page': int
+                }
+                # more figures
+            ]
+        """
+        pages = pdf2image.convert_from_path(pdf, last_page=num_pages)
+        table_ext = self.tableextractor
+        table_ext.set_pdf_file(pdf)
+        table_ext.set_output_image(output_image)
+        table_ext.set_output_bbox(output_bbox)
+        return table_ext.extract_all_tables_and_figures(pages, self.pdfparser, content='figures')
+    def extract_tables_from_pdf(self, pdf, num_pages=None, output_bbox=False, output_image=True):
+        """
+        Find and return all tables from a pdf page
+        Parameters:
+            pdf: path to pdf
+            num_pages: process only first `num_pages` pages, if `None` then process all
+            output_bbox: whether to include bboxes for individual entries of the table
+            output_image: whether to include PIL image for figures. default is True
+        Returns:
+            list of content in the following format
+            [
+                { # first table
+                    'title': str,
+                    'figure': {
+                        'image': PIL image or None,
+                        'bbox': list in form [x1, y1, x2, y2] or empty list,
+                    }
+                    'table': {
+                        'bbox': list in form [x1, y1, x2, y2] or empty list,
+                        'content': {
+                            'columns': list of column headers,
+                            'rows': list of list of row content,
+                        }
+                    }
+                    'footnote': str or empty,
+                    'page': int
+                }
+                # more tables
+            ]
+        """
+        pages = pdf2image.convert_from_path(pdf, last_page=num_pages)
+        table_ext = self.tableextractor
+        table_ext.set_pdf_file(pdf)
+        table_ext.set_output_image(output_image)
+        table_ext.set_output_bbox(output_bbox)
+        return table_ext.extract_all_tables_and_figures(pages, self.pdfparser, content='tables')
+    def extract_molecules_from_figures_in_pdf(self, pdf, batch_size=16, num_pages=None):
+        """
+        Get all molecules and their information from a pdf
+        Parameters:
+            pdf: path to pdf, or byte file
+            batch_size: batch size for inference in all models
+            num_pages: process only first `num_pages` pages, if `None` then process all
+        Returns:
+            list of figures and corresponding molecule info in the following format
+            [
+                {   # first figure
+                    'image': ndarray of the figure image,
+                    'molecules': [
+                        {   # first molecule
+                            'bbox': tuple in the form (x1, y1, x2, y2),
+                            'score': float,
+                            'image': ndarray of cropped molecule image,
+                            'smiles': str,
+                            'molfile': str
+                        },
+                        # more molecules
+                    ],
+                    'page': int
+                },
+                # more figures
+            ]
+        """
+        figures = self.extract_figures_from_pdf(pdf, num_pages=num_pages, output_bbox=True)
+        images = [figure['figure']['image'] for figure in figures]
+        results = self.extract_molecules_from_figures(images, batch_size=batch_size)
+        for figure, result in zip(figures, results):
+            result['page'] = figure['page']
+        return results
+    def extract_molecule_bboxes_from_figures(self, figures, batch_size=16):
+        """
+        Return bounding boxes of molecules in images
+        Parameters:
+            figures: list of PIL or ndarray images
+            batch_size: batch size for inference
+        Returns:
+            list of results for each figure in the following format
+            [
+                [   # first figure
+                    {   # first bounding box
+                        'category': str,
+                        'bbox': tuple in the form (x1, y1, x2, y2),
+                        'category_id': int,
+                        'score': float
+                    },
+                    # more bounding boxes
+                ],
+                # more figures
+            ]
+        """
+        figures = [convert_to_pil(figure) for figure in figures]
+        return self.moldet.predict_images(figures, batch_size=batch_size)
+    def extract_molecules_from_figures(self, figures, batch_size=16):
+        """
+        Get all molecules and their information from list of figures
+        Parameters:
+            figures: list of PIL or ndarray images
+            batch_size: batch size for inference
+        Returns:
+            list of results for each figure in the following format
+            [
+                {   # first figure
+                    'image': ndarray of the figure image,
+                    'molecules': [
+                        {   # first molecule
+                            'bbox': tuple in the form (x1, y1, x2, y2),
+                            'score': float,
+                            'image': ndarray of cropped molecule image,
+                            'smiles': str,
+                            'molfile': str
+                        },
+                        # more molecules
+                    ],
+                },
+                # more figures
+            ]
+        """
+        bboxes = self.extract_molecule_bboxes_from_figures(figures, batch_size=batch_size)
+        figures = [convert_to_cv2(figure) for figure in figures]
+        results, cropped_images, refs = clean_bbox_output(figures, bboxes)
+        mol_info = self.molscribe.predict_images(cropped_images, batch_size=batch_size)
+        for info, ref in zip(mol_info, refs):
+            ref.update(info)
+        return results
+    def extract_molecule_corefs_from_figures_in_pdf(self, pdf, batch_size=16, num_pages=None, molscribe = True, ocr = True):
+        """
+        Get all molecule bboxes and corefs from figures in pdf
+        Parameters:
+            pdf: path to pdf, or byte file
+            batch_size: batch size for inference in all models
+            num_pages: process only first `num_pages` pages, if `None` then process all
+        Returns:
+            list of results for each figure in the following format:
+            [
+                {
+                    'bboxes': [
+                        {   # first bbox
+                            'category': '[Sup]',
+                            'bbox': (0.0050025012506253125, 0.38273870663142223, 0.9934967483741871, 0.9450094869920168),
+                            'category_id': 4,
+                            'score': -0.07593922317028046
+                        },
+                        # More bounding boxes
+                    ],
+                    'corefs': [
+                        [0, 1],  # molecule bbox index, identifier bbox index
+                        [3, 4],
+                        # More coref pairs
+                    ],
+                    'page': int
+                },
+                # More figures
+            ]
+        """
+        figures = self.extract_figures_from_pdf(pdf, num_pages=num_pages, output_bbox=True)
+        images = [figure['figure']['image'] for figure in figures]
+        results = self.extract_molecule_corefs_from_figures(images, batch_size=batch_size, molscribe=molscribe, ocr=ocr)
+        for figure, result in zip(figures, results):
+            result['page'] = figure['page']
+        return results
+    def extract_molecule_corefs_from_figures(self, figures, batch_size=16, molscribe=True, ocr=True):
+        """
+        Get all molecule bboxes and corefs from list of figures
+        Parameters:
+            figures: list of PIL or ndarray images
+            batch_size: batch size for inference
+        Returns:
+            list of results for each figure in the following format:
+            [
+                {
+                    'bboxes': [
+                        {   # first bbox
+                            'category': '[Sup]',
+                            'bbox': (0.0050025012506253125, 0.38273870663142223, 0.9934967483741871, 0.9450094869920168),
+                            'category_id': 4,
+                            'score': -0.07593922317028046
+                        },
+                        # More bounding boxes
+                    ],
+                    'corefs': [
+                        [0, 1],  # molecule bbox index, identifier bbox index
+                        [3, 4],
+                        # More coref pairs
+                    ],
+                },
+                # More figures
+            ]
+        """
+        figures = [convert_to_pil(figure) for figure in figures]
+        return self.coref.predict_images(figures, batch_size=batch_size, coref=True, molscribe = molscribe, ocr = ocr)
+    def extract_reactions_from_figures_in_pdf(self, pdf, batch_size=16, num_pages=None, molscribe=True, ocr=True):
+        """
+        Get reaction information from figures in pdf
+        Parameters:
+            pdf: path to pdf, or byte file
+            batch_size: batch size for inference in all models
+            num_pages: process only first `num_pages` pages, if `None` then process all
+            molscribe: whether to predict and return smiles and molfile info
+            ocr: whether to predict and return text of conditions
+        Returns:
+            list of figures and corresponding molecule info in the following format
+            [
+                {
+                    'figure': PIL image
+                    'reactions': [
+                        {
+                            'reactants': [
+                                {
+                                    'category': str,
+                                    'bbox': tuple (x1,x2,y1,y2),
+                                    'category_id': int,
+                                    'smiles': str,
+                                    'molfile': str,
+                                },
+                                # more reactants
+                            ],
+                            'conditions': [
+                                {
+                                    'category': str,
+                                    'bbox': tuple (x1,x2,y1,y2),
+                                    'category_id': int,
+                                    'text': list of str,
+                                },
+                                # more conditions
+                            ],
+                            'products': [
+                                # same structure as reactants
+                            ]
+                        },
+                        # more reactions
+                    ],
+                    'page': int
+                },
+                # more figures
+            ]
+        """
+        figures = self.extract_figures_from_pdf(pdf, num_pages=num_pages, output_bbox=True)
+        images = [figure['figure']['image'] for figure in figures]
+        results = self.extract_reactions_from_figures(images, batch_size=batch_size, molscribe=molscribe, ocr=ocr)
+        for figure, result in zip(figures, results):
+            result['page'] = figure['page']
+        return results
+    def extract_reactions_from_figures(self, figures, batch_size=16, molscribe=True, ocr=True):
+        """
+        Get reaction information from list of figures
+        Parameters:
+            figures: list of PIL or ndarray images
+            batch_size: batch size for inference in all models
+            molscribe: whether to predict and return smiles and molfile info
+            ocr: whether to predict and return text of conditions
+        Returns:
+            list of figures and corresponding molecule info in the following format
+            [
+                {
+                    'figure': PIL image
+                    'reactions': [
+                        {
+                            'reactants': [
+                                {
+                                    'category': str,
+                                    'bbox': tuple (x1,x2,y1,y2),
+                                    'category_id': int,
+                                    'smiles': str,
+                                    'molfile': str,
+                                },
+                                # more reactants
+                            ],
+                            'conditions': [
+                                {
+                                    'category': str,
+                                    'bbox': tuple (x1,x2,y1,y2),
+                                    'category_id': int,
+                                    'text': list of str,
+                                },
+                                # more conditions
+                            ],
+                            'products': [
+                                # same structure as reactants
+                            ]
+                        },
+                        # more reactions
+                    ],
+                },
+                # more figures
+            ]
+        """
+        pil_figures = [convert_to_pil(figure) for figure in figures]
+        results = []
+        reactions = self.rxnscribe.predict_images(pil_figures, batch_size=batch_size, molscribe=molscribe, ocr=ocr)
+        for figure, rxn in zip(figures, reactions):
+            data = {
+                'figure': figure,
+                'reactions': rxn,
+                }
+            results.append(data)
+        return results
+    def extract_molecules_from_text_in_pdf(self, pdf, batch_size=16, num_pages=None):
+        """
+        Get molecules in text of given pdf
+        Parameters:
+            pdf: path to pdf, or byte file
+            batch_size: batch size for inference in all models
+            num_pages: process only first `num_pages` pages, if `None` then process all
+        Returns:
+            list of sentences and found molecules in the following format
+            [
+                {
+                    'molecules': [
+                        { # first paragraph
+                            'text': str,
+                            'labels': [
+                                (str, int, int), # tuple of label, range start (inclusive), range end (exclusive)
+                                # more labels
+                            ]
+                        },
+                        # more paragraphs
+                    ]
+                    'page': int
+                },
+                # more pages
+            ]
+        """
+        self.chemrxnextractor.set_pdf_file(pdf)
+        self.chemrxnextractor.set_pages(num_pages)
+        text = self.chemrxnextractor.get_paragraphs_from_pdf(num_pages)
+        result = []
+        for data in text:
+            model_inp = []
+            for paragraph in data['paragraphs']:
+                model_inp.append(' '.join(paragraph).replace('\n', ''))
+            output = self.chemner.predict_strings(model_inp, batch_size=batch_size)
+            to_add = {
+                'molecules': [{
+                    'text': t,
+                    'labels': labels,
+                    } for t, labels in zip(model_inp, output)],
+                'page': data['page']
+            }
+            result.append(to_add)
+        return result
+    def extract_reactions_from_text_in_pdf(self, pdf, num_pages=None):
+        """
+        Get reaction information from text in pdf
+        Parameters:
+            pdf: path to pdf
+            num_pages: process only first `num_pages` pages, if `None` then process all
+        Returns:
+            list of pages and corresponding reaction info in the following format
+            [
+                {
+                    'page': page number
+                    'reactions': [
+                        {
+                            'tokens': list of words in relevant sentence,
+                            'reactions' : [
+                                {
+                                    # key, value pairs where key is the label and value is a tuple
+                                    # or list of tuples of the form (tokens, start index, end index)
+                                    # where indices are for the corresponding token list and start and end are inclusive
+                                }
+                                # more reactions
+                            ]
+                        }
+                        # more reactions in other sentences
+                    ]
+                },
+                # more pages
+            ]
+        """
+        self.chemrxnextractor.set_pdf_file(pdf)
+        self.chemrxnextractor.set_pages(num_pages)
+        return self.chemrxnextractor.extract_reactions_from_text()
+    def extract_reactions_from_text_in_pdf_combined(self, pdf, num_pages=None):
+        """
+        Get reaction information from text in pdf and combined with corefs from figures
+        Parameters:
+            pdf: path to pdf
+            num_pages: process only first `num_pages` pages, if `None` then process all
+        Returns:
+            list of pages and corresponding reaction info in the following format
+            [
+                {
+                    'page': page number
+                    'reactions': [
+                        {
+                            'tokens': list of words in relevant sentence,
+                            'reactions' : [
+                                {
+                                    # key, value pairs where key is the label and value is a tuple
+                                    # or list of tuples of the form (tokens, start index, end index)
+                                    # where indices are for the corresponding token list and start and end are inclusive
+                                }
+                                # more reactions
+                            ]
+                        }
+                        # more reactions in other sentences
+                    ]
+                },
+                # more pages
+            ]
+        """
+        results = self.extract_reactions_from_text_in_pdf(pdf, num_pages=num_pages)
+        results_coref = self.extract_molecule_corefs_from_figures_in_pdf(pdf, num_pages=num_pages)
+        return associate_corefs(results, results_coref)
+    def extract_reactions_from_figures_and_tables_in_pdf(self, pdf, num_pages=None, batch_size=16, molscribe=True, ocr=True):
+        """
+        Get reaction information from figures and combine with table information in pdf
+        Parameters:
+            pdf: path to pdf, or byte file
+            batch_size: batch size for inference in all models
+            num_pages: process only first `num_pages` pages, if `None` then process all
+            molscribe: whether to predict and return smiles and molfile info
+            ocr: whether to predict and return text of conditions
+        Returns:
+            list of figures and corresponding molecule info in the following format
+            [
+                {
+                    'figure': PIL image
+                    'reactions': [
+                        {
+                            'reactants': [
+                                {
+                                    'category': str,
+                                    'bbox': tuple (x1,x2,y1,y2),
+                                    'category_id': int,
+                                    'smiles': str,
+                                    'molfile': str,
+                                },
+                                # more reactants
+                            ],
+                            'conditions': [
+                                {
+                                    'category': str,
+                                    'text': list of str,
+                                },
+                                # more conditions
+                            ],
+                            'products': [
+                                # same structure as reactants
+                            ]
+                        },
+                        # more reactions
+                    ],
+                    'page': int
+                },
+                # more figures
+            ]
+        """
+        figures = self.extract_figures_from_pdf(pdf, num_pages=num_pages, output_bbox=True)
+        images = [figure['figure']['image'] for figure in figures]
+        results = self.extract_reactions_from_figures(images, batch_size=batch_size, molscribe=molscribe, ocr=ocr)
+        results = process_tables(figures, results, self.molscribe, batch_size=batch_size)
+        results_coref = self.extract_molecule_corefs_from_figures_in_pdf(pdf, num_pages=num_pages)
+        results = replace_rgroups_in_figure(figures, results, results_coref, self.molscribe, batch_size=batch_size)
+        results = expand_reactions_with_backout(results, results_coref, self.molscribe)
+        return results
+    def extract_reactions_from_pdf(self, pdf, num_pages=None, batch_size=16):
+        """
+        Returns:
+            dictionary of reactions from multimodal sources
+            {
+                'figures': [
+                    {
+                        'figure': PIL image
+                        'reactions': [
+                            {
+                                'reactants': [
+                                    {
+                                        'category': str,
+                                        'bbox': tuple (x1,x2,y1,y2),
+                                        'category_id': int,
+                                        'smiles': str,
+                                        'molfile': str,
+                                    },
+                                    # more reactants
+                                ],
+                                'conditions': [
+                                    {
+                                        'category': str,
+                                        'text': list of str,
+                                    },
+                                    # more conditions
+                                ],
+                                'products': [
+                                    # same structure as reactants
+                                ]
+                            },
+                            # more reactions
+                        ],
+                        'page': int
+                    },
+                    # more figures
+                ]
+                'text': [
+                    {
+                        'page': page number
+                        'reactions': [
+                            {
+                                'tokens': list of words in relevant sentence,
+                                'reactions' : [
+                                    {
+                                        # key, value pairs where key is the label and value is a tuple
+                                        # or list of tuples of the form (tokens, start index, end index)
+                                        # where indices are for the corresponding token list and start and end are inclusive
+                                    }
+                                    # more reactions
+                                ]
+                            }
+                            # more reactions in other sentences
+                        ]
+                    },
+                    # more pages
+                ]
+            }
+        """
+        figures = self.extract_figures_from_pdf(pdf, num_pages=num_pages, output_bbox=True)
+        images = [figure['figure']['image'] for figure in figures]
+        results = self.extract_reactions_from_figures(images, batch_size=batch_size, molscribe=True, ocr=True)
+        table_expanded_results = process_tables(figures, results, self.molscribe, batch_size=batch_size)
+        text_results = self.extract_reactions_from_text_in_pdf(pdf, num_pages=num_pages)
+        results_coref = self.extract_molecule_corefs_from_figures_in_pdf(pdf, num_pages=num_pages)
+        figure_results = replace_rgroups_in_figure(figures, table_expanded_results, results_coref, self.molscribe, batch_size=batch_size)
+        table_expanded_results = expand_reactions_with_backout(figure_results, results_coref, self.molscribe)
+        coref_expanded_results = associate_corefs(text_results, results_coref)
+        return {
+            'figures': table_expanded_results,
+            'text': coref_expanded_results,
+        }
+if __name__=="__main__":
+    model = OpenChemIE()

chemietoolkit/tableextractor.py ADDED Viewed

	@@ -0,0 +1,340 @@

+import pdf2image
+import numpy as np
+from PIL import Image
+import matplotlib.pyplot as plt
+import layoutparser as lp
+import cv2
+from PyPDF2 import PdfReader, PdfWriter
+import pandas as pd
+import pdfminer.high_level
+import pdfminer.layout
+from operator import itemgetter
+# inputs: pdf_file, page #, bounding box (optional) (llur or ullr), output_bbox
+class TableExtractor(object):
+    def __init__(self, output_bbox=True):
+        self.pdf_file = ""
+        self.page = ""
+        self.image_dpi = 200
+        self.pdf_dpi = 72
+        self.output_bbox = output_bbox
+        self.blocks = {}
+        self.title_y = 0
+        self.column_header_y = 0
+        self.model = None
+        self.img = None
+        self.output_image = True
+        self.tagging = {
+            'substance': ['compound', 'salt', 'base', 'solvent', 'CBr4', 'collidine', 'InX3', 'substrate', 'ligand', 'PPh3', 'PdL2', 'Cu', 'compd', 'reagent', 'reagant', 'acid', 'aldehyde', 'amine', 'Ln', 'H2O', 'enzyme', 'cofactor', 'oxidant', 'Pt(COD)Cl2', 'CuBr2', 'additive'],
+            'ratio': [':'],
+            'measurement': ['μM', 'nM', 'IC50', 'CI', 'excitation', 'emission', 'Φ', 'φ', 'shift', 'ee', 'ΔG', 'ΔH', 'TΔS', 'Δ', 'distance', 'trajectory', 'V', 'eV'],
+            'temperature': ['temp', 'temperature', 'T', '°C'],
+            'time': ['time', 't(', 't ('],
+            'result': ['yield', 'aa', 'result', 'product', 'conversion', '(%)'],
+            'alkyl group': ['R', 'Ar', 'X', 'Y'],
+            'solvent': ['solvent'],
+            'counter': ['entry', 'no.'],
+            'catalyst': ['catalyst', 'cat.'],
+            'conditions': ['condition'],
+            'reactant': ['reactant'],
+        }
+    def set_output_image(self, oi):
+        """Store `oi` as `self.output_image`."""
+        self.output_image = oi
+    def set_pdf_file(self, pdf):
+        """Store `pdf` as `self.pdf_file`, the file later opened by the PyPDF2 and pdfminer calls."""
+        self.pdf_file = pdf
+    def set_page_num(self, pn):
+        """Store `pn` as `self.page`; it is later used to index `reader.pages` and as the pdfminer page number."""
+        self.page = pn
+    def set_output_bbox(self, ob):
+        """Store `ob` as `self.output_bbox`, which decides whether column entries include a 'bbox' key."""
+        self.output_bbox = ob
+    def run_model(self, page_info):
+        #img = np.asarray(pdf2image.convert_from_path(self.pdf_file, dpi=self.image_dpi)[self.page])
+        #model = lp.Detectron2LayoutModel('lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config', extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5], label_map={0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"})
+        img = np.asarray(page_info)
+        self.img = img
+        layout_result = self.model.detect(img)
+        text_blocks = lp.Layout([b for b in layout_result if b.type == 'Text'])
+        title_blocks = lp.Layout([b for b in layout_result if b.type == 'Title'])
+        list_blocks = lp.Layout([b for b in layout_result if b.type == 'List'])
+        table_blocks = lp.Layout([b for b in layout_result if b.type == 'Table'])
+        figure_blocks = lp.Layout([b for b in layout_result if b.type == 'Figure'])
+        self.blocks.update({'text': text_blocks})
+        self.blocks.update({'title': title_blocks})
+        self.blocks.update({'list': list_blocks})
+        self.blocks.update({'table': table_blocks})
+        self.blocks.update({'figure': figure_blocks})
+    # type is what coordinates you want to get. it comes in text, title, list, table, and figure
+    def convert_to_pdf_coordinates(self, type):
+        # scale coordinates
+        blocks = self.blocks[type]
+        coordinates =  [blocks[a].scale(self.pdf_dpi/self.image_dpi) for a in range(len(blocks))]
+        reader = PdfReader(self.pdf_file)
+        writer = PdfWriter()
+        p = reader.pages[self.page]
+        a = p.mediabox.upper_left
+        new_coords = []
+        for new_block in coordinates:
+            new_coords.append((new_block.block.x_1, pd.to_numeric(a[1]) - new_block.block.y_2, new_block.block.x_2, pd.to_numeric(a[1]) - new_block.block.y_1))
+        return new_coords
+    # output: list of bounding boxes for tables but in pdf coordinates
+    # input: new_coords is singular table bounding box in pdf coordinates
+    def extract_singular_table(self, new_coords):
+        """Extract the column header and the rows of one table on the current page.
+
+        Horizontal text lines that lie strictly inside ``new_coords`` are
+        collected, grouped into visual rows by vertical overlap, and merged into
+        cells by horizontal overlap. The top-most group becomes the column
+        header; each later group becomes one row, padded with placeholder cells
+        so that its cells line up with the header columns.
+
+        Args:
+            new_coords: Bounding box of the table, indexed as
+                ``[x, y, x, y]``; min/max are taken per axis, so the corner
+                order does not matter.
+
+        Returns:
+            A dict ``{"columns": [...], "rows": [...]}``. Column entries are
+            dicts with ``text`` and ``tag`` (plus ``bbox`` when
+            ``self.output_bbox`` is set). Row cells are dicts with ``text`` and
+            ``bbox`` when ``self.output_bbox`` is set, plain strings otherwise.
+            Falls through and returns ``None`` when no page layout contains
+            more than one matching text line.
+
+        Side effects:
+            Sets ``self.column_header_y`` from the header cells.
+        """
+        for page_layout in pdfminer.high_level.extract_pages(self.pdf_file, page_numbers=[self.page]):
+            elements = []
+            for element in page_layout:
+                if isinstance(element, pdfminer.layout.LTTextBox):
+                    for e in element._objs:
+                        temp = e.bbox
+                        # keep only horizontal text lines whose four edges are all strictly inside the table box
+                        if temp[0] > min(new_coords[0], new_coords[2]) and temp[0] < max(new_coords[0], new_coords[2]) and temp[1] > min(new_coords[1], new_coords[3]) and temp[1] < max(new_coords[1], new_coords[3]) and temp[2] > min(new_coords[0], new_coords[2]) and temp[2] < max(new_coords[0], new_coords[2]) and temp[3] > min(new_coords[1], new_coords[3]) and temp[3] < max(new_coords[1], new_coords[3]) and isinstance(e, pdfminer.layout.LTTextLineHorizontal):
+                            elements.append([e.bbox[0], e.bbox[1], e.bbox[2], e.bbox[3], e.get_text()])
+            # sort by left edge first, then (stable sort) by upper edge descending,
+            # so w runs top-to-bottom and left-to-right among lines with equal upper edge
+            elements = sorted(elements, key=itemgetter(0))
+            w = sorted(elements, key=itemgetter(3), reverse=True)
+            if len(w) <= 1:
+                continue
+            ret = {}
+            i = 1
+            # header group: keep taking lines while the next line's upper edge is
+            # above the previous line's lower edge (i.e. they overlap vertically)
+            g = [w[0]]
+            while i < len(w) and w[i][3] > w[i-1][1]:
+                g.append(w[i])
+                i += 1
+            g = sorted(g, key=itemgetter(0))
+            # check for overlaps
+            # walk right-to-left so that pop(a) never shifts an index still to be visited;
+            # overlapping neighbours are merged into one cell (union bbox, joined text)
+            for a in range(len(g)-1, 0, -1):
+                if g[a][0] < g[a-1][2]:
+                    g[a-1][0] = min(g[a][0], g[a-1][0])
+                    g[a-1][1] = min(g[a][1], g[a-1][1])
+                    g[a-1][2] = max(g[a][2], g[a-1][2])
+                    g[a-1][3] = max(g[a][3], g[a-1][3])
+                    g[a-1][4] = g[a-1][4].strip() + " " + g[a][4]
+                    g.pop(a)
+            ret.update({"columns":[]})
+            for t in g:
+                temp_bbox = t[:4]
+                column_text = t[4].strip()
+                # tag the column with the first key of self.tagging that has a word
+                # occurring as a substring of the header text; 'unknown' otherwise
+                tag = 'unknown'
+                tagged = False
+                for key in self.tagging.keys():
+                    for word in self.tagging[key]:
+                        if word in column_text:
+                            tag = key
+                            tagged = True
+                            break
+                    if tagged:
+                        break
+                if self.output_bbox:
+                    ret["columns"].append({'text':column_text,'tag': tag, 'bbox':temp_bbox})
+                else:
+                    ret["columns"].append({'text':column_text,'tag': tag})
+                # overwritten on every iteration, so the last header cell decides the final value
+                self.column_header_y = max(t[1], t[3])
+            ret.update({"rows":[]})
+            # sentinels: the left one ends (index 2) at new_coords[0], the right one
+            # starts (index 0) at new_coords[2]; real header cells are now g[1:-1]
+            g.insert(0, [0, 0, new_coords[0], 0, ''])
+            g.append([new_coords[2], 0, 0, 0, ''])
+            while i < len(w):
+                # gather the next visual row with the same vertical-overlap rule as the header
+                group = [w[i]]
+                i += 1
+                while i < len(w) and w[i][3] > w[i-1][1]:
+                    group.append(w[i])
+                    i += 1
+                group = sorted(group, key=itemgetter(0))
+                # merge horizontally overlapping pieces, right-to-left as above
+                for a in range(len(group)-1, 0, -1):
+                    if group[a][0] < group[a-1][2]:
+                        group[a-1][0] = min(group[a][0], group[a-1][0])
+                        group[a-1][1] = min(group[a][1], group[a-1][1])
+                        group[a-1][2] = max(group[a][2], group[a-1][2])
+                        group[a-1][3] = max(group[a][3], group[a-1][3])
+                        group[a-1][4] = group[a-1][4].strip() + " " + group[a][4]
+                        group.pop(a)
+                # align cells with header columns: a is the 1-based header position in g
+                a = 1
+                while a < len(g) - 1:
+                    if a > len(group):
+                        # the row ran out of cells: pad with a placeholder (its text strips to '')
+                        group.append([0, 0, 0, 0, '\n'])
+                        a += 1
+                        continue
+                    # the cell fits when it lies between the right edge of the previous
+                    # header entry and the left edge of the next header entry
+                    if group[a-1][0] >= g[a-1][2] and group[a-1][2] <= g[a+1][0]:
+                        pass
+                        """
+                        if a < len(group) and group[a][0] >= g[a-1][2] and group[a][2] <= g[a+1][0]:
+                            g.insert(1, [g[0][2], 0, group[a-1][2], 0, ''])
+                            #ret["columns"].insert(0, '')
+                        else:
+                            a += 1
+                            continue
+                        """
+                    # otherwise this column is empty in this row: insert a placeholder before the cell
+                    else: group.insert(a-1, [0, 0, 0, 0, '\n'])
+                    a += 1
+                added_row = []
+                for t in group:
+                    temp_bbox = t[:4]
+                    if self.output_bbox:
+                        added_row.append({'text':t[4].strip(), 'bbox':temp_bbox})
+                    else:
+                        added_row.append(t[4].strip())
+                ret["rows"].append(added_row)
+            # if the first row does not have as many cells as the detected header,
+            # the detected header is discarded and the first row is promoted to header
+            # NOTE(review): when self.output_bbox is false the row cells are plain strings,
+            # so col['text'] / col['tag'] below would raise TypeError - confirm and fix
+            if ret["rows"] and len(ret["rows"][0]) != len(ret["columns"]):
+                ret["columns"] = ret["rows"][0]
+                ret["rows"] = ret["rows"][1:]
+                for col in ret['columns']:
+                    tag = 'unknown'
+                    tagged = False
+                    for key in self.tagging.keys():
+                        for word in self.tagging[key]:
+                            if word in col['text']:
+                                tag = key
+                                tagged = True
+                                break
+                        if tagged:
+                            break
+                    col['tag'] = tag
+            # returns from inside the loop: only the first layout with more than one line is processed
+            return ret
+    def get_title_and_footnotes(self, tb_coords):
+        for page_layout in pdfminer.high_level.extract_pages(self.pdf_file, page_numbers=[self.page]):
+            title = (0, 0, 0, 0, '')
+            footnote = (0, 0, 0, 0, '')
+            title_gap = 30
+            footnote_gap = 30
+            for element in page_layout:
+                if isinstance(element, pdfminer.layout.LTTextBoxHorizontal):
+                    if (element.bbox[0] >= tb_coords[0] and element.bbox[0] <= tb_coords[2]) or (element.bbox[2] >= tb_coords[0] and element.bbox[2] <= tb_coords[2]) or (tb_coords[0] >= element.bbox[0] and tb_coords[0] <= element.bbox[2]) or (tb_coords[2] >= element.bbox[0] and tb_coords[2] <= element.bbox[2]):
+                        #print(element)
+                        if 'Table' in element.get_text():
+                            if abs(element.bbox[1] - tb_coords[3]) < title_gap:
+                                title = tuple(element.bbox) + (element.get_text()[element.get_text().index('Table'):].replace('\n', ' '),)
+                                title_gap = abs(element.bbox[1] - tb_coords[3])
+                        if 'Scheme' in element.get_text():
+                            if abs(element.bbox[1] - tb_coords[3]) < title_gap:
+                                title = tuple(element.bbox) + (element.get_text()[element.get_text().index('Scheme'):].replace('\n', ' '),)
+                                title_gap = abs(element.bbox[1] - tb_coords[3])
+                        if element.bbox[1] >= tb_coords[1] and element.bbox[3] <= tb_coords[3]: continue
+                        #print(element)
+                        temp = ['aA', 'aB', 'aC', 'aD', 'aE', 'aF', 'aG', 'aH', 'aI', 'aJ', 'aK', 'aL', 'aM', 'aN', 'aO', 'aP', 'aQ', 'aR', 'aS', 'aT', 'aU', 'aV', 'aW', 'aX', 'aY', 'aZ', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9', 'a0']
+                        for segment in temp:
+                            if segment in element.get_text():
+                                if abs(element.bbox[3] - tb_coords[1]) < footnote_gap:
+                                    footnote = tuple(element.bbox) + (element.get_text()[element.get_text().index(segment):].replace('\n', ' '),)
+                                    footnote_gap = abs(element.bbox[3] - tb_coords[1])
+                                break
+            self.title_y = min(title[1], title[3])
+            if self.output_bbox:
+                return ({'text': title[4], 'bbox': list(title[:4])}, {'text': footnote[4], 'bbox': list(footnote[:4])})
+            else:
+                return (title[4], footnote[4])
+    def extract_table_information(self):
+        #self.run_model(page_info) # changed
+        table_coordinates = self.blocks['table'] #should return a list of layout objects
+        table_coordinates_in_pdf = self.convert_to_pdf_coordinates('table') #should return a list of lists
+        ans = []
+        i = 0
+        for coordinate in table_coordinates_in_pdf:
+            ret = {}
+            pad = 20
+            coordinate = [coordinate[0] - pad, coordinate[1], coordinate[2] + pad, coordinate[3]]
+            ullr_coord = [coordinate[0], coordinate[3], coordinate[2], coordinate[1]]
+            table_results = self.extract_singular_table(coordinate)
+            tf = self.get_title_and_footnotes(coordinate)
+            figure = Image.fromarray(table_coordinates[i].crop_image(self.img))
+            ret.update({'title': tf[0]})
+            ret.update({'figure': {
+                'image': None,
+                'bbox': []
+                       }})
+            if self.output_image:
+                ret['figure']['image'] = figure
+            ret.update({'table': {'bbox': list(coordinate), 'content': table_results}})
+            ret.update({'footnote': tf[1]})
+            if abs(self.title_y - self.column_header_y) > 50:
+                ret['figure']['bbox'] = list(coordinate)
+            ret.update({'page':self.page})
+            ans.append(ret)
+            i += 1
+        return ans
+    def extract_figure_information(self):
+        figure_coordinates = self.blocks['figure']
+        figure_coordinates_in_pdf = self.convert_to_pdf_coordinates('figure')
+        ans = []
+        for i in range(len(figure_coordinates)):
+            ret = {}
+            coordinate = figure_coordinates_in_pdf[i]
+            ullr_coord = [coordinate[0], coordinate[3], coordinate[2], coordinate[1]]
+            tf = self.get_title_and_footnotes(coordinate)
+            figure = Image.fromarray(figure_coordinates[i].crop_image(self.img))
+            ret.update({'title':tf[0]})
+            ret.update({'figure': {
+                'image': None,
+                'bbox': []
+                       }})
+            if self.output_image:
+                ret['figure']['image'] = figure
+            ret.update({'table': {
+                'bbox': [],
+                'content': None
+                       }})
+            ret.update({'footnote': tf[1]})
+            ret['figure']['bbox'] = list(coordinate)
+            ret.update({'page':self.page})
+            ans.append(ret)
+        return ans
+    def extract_all_tables_and_figures(self, pages, pdfparser, content=None):
+        self.model = pdfparser
+        ret = []
+        for i in range(len(pages)):
+            self.set_page_num(i)
+            self.run_model(pages[i])
+            table_info = self.extract_table_information()
+            figure_info = self.extract_figure_information()
+            if content == 'tables':
+                ret += table_info
+            elif content == 'figures':
+                ret += figure_info
+                for table in table_info:
+                    if table['figure']['bbox'] != []:
+                        ret.append(table)
+            else:
+                ret += table_info
+                ret += figure_info
+        return ret

chemietoolkit/utils.py ADDED Viewed

	@@ -0,0 +1,1018 @@

+import numpy as np
+from PIL import Image
+import cv2
+import layoutparser as lp
+from rdkit import Chem
+from rdkit.Chem import Draw
+from rdkit.Chem import rdDepictor
+rdDepictor.SetPreferCoordGen(True)
+from rdkit.Chem.Draw import IPythonConsole
+from rdkit.Chem import AllChem
+import re
+import copy
+# Integer code for each bond-type name; these codes are written into the
+# adjacency ("edges") matrices of the molecule graphs built in this module.
+# 0 doubles as "no bond" in those matrices.
+BOND_TO_INT = {
+    "": 0,
+    "single": 1,
+    "double": 2,
+    "triple": 3,
+    "aromatic": 4,
+    "solid wedge": 5,
+    "dashed wedge": 6
+}
+# Atom labels that are treated as R-group placeholders.
+RGROUP_SYMBOLS = ['R', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10', 'R11', 'R12',
+                  'Ra', 'Rb', 'Rc', 'Rd', 'Rf', 'X', 'Y', 'Z', 'Q', 'A', 'E', 'Ar', 'Ar1', 'Ar2', 'Ari', "R'",
+                  '1*', '2*','3*', '4*','5*', '6*','7*', '8*','9*', '10*','11*', '12*','[a*]', '[b*]','[c*]', '[d*]']
+# Also accept every label wrapped in square brackets, e.g. 'R1' -> '[R1]'.
+RGROUP_SYMBOLS = RGROUP_SYMBOLS + [f'[{i}]' for i in RGROUP_SYMBOLS]
+# SMILES tokens that stand for an R group; get_atom_mapping rewrites each of
+# them (other than the bare '*') to '*' before building query molecules.
+RGROUP_SMILES = ['[1*]', '[2*]','[3*]', '[4*]','[5*]', '[6*]','[7*]', '[8*]','[9*]', '[10*]','[11*]', '[12*]','[a*]', '[b*]','[c*]', '[d*]','*', '[Rf]']
+def get_figures_from_pages(pages, pdfparser):
+    figures = []
+    for i in range(len(pages)):
+        img = np.asarray(pages[i])
+        layout = pdfparser.detect(img)
+        blocks = lp.Layout([b for b in layout if b.type == "Figure"])
+        for block in blocks:
+            figure = Image.fromarray(block.crop_image(img))
+            figures.append({
+                'image': figure,
+                'page': i
+            })
+    return figures
+def clean_bbox_output(figures, bboxes):
+    results = []
+    cropped = []
+    references = []
+    for i, output in enumerate(bboxes):
+        mol_bboxes = [elt['bbox'] for elt in output if elt['category'] == '[Mol]']
+        mol_scores = [elt['score'] for elt in output if elt['category'] == '[Mol]']
+        data = {}
+        results.append(data)
+        data['image'] = figures[i]
+        data['molecules'] = []
+        for bbox, score in zip(mol_bboxes, mol_scores):
+            x1, y1, x2, y2 = bbox
+            height, width, _ = figures[i].shape
+            cropped_img = figures[i][int(y1*height):int(y2*height),int(x1*width):int(x2*width)]
+            cur_mol = {
+                'bbox': bbox,
+                'score': score,
+                'image': cropped_img,
+                #'info': None,
+            }
+            cropped.append(cropped_img)
+            data['molecules'].append(cur_mol)
+            references.append(cur_mol)
+    return results, cropped, references
+def convert_to_pil(image):
+    if type(image) == np.ndarray:
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        image = Image.fromarray(image)
+    return image
+def convert_to_cv2(image):
+    if type(image) != np.ndarray:
+        image = cv2.cvtColor(np.array(image), cv2.COLOR_BGR2RGB)
+    return image
+def replace_rgroups_in_figure(figures, results, coref_results, molscribe, batch_size=16):
+    pattern = re.compile('(?P<name>[RXY]\d?)[ ]*=[ ]*(?P<group>\w+)')
+    for figure, result, corefs in zip(figures, results, coref_results):
+        r_groups = []
+        seen_r_groups = set()
+        for bbox in corefs['bboxes']:
+            if bbox['category'] == '[Idt]':
+                for text in bbox['text']:
+                    res = pattern.search(text)
+                    if res is None:
+                        continue
+                    name = res.group('name')
+                    group = res.group('group')
+                    if (name, group) in seen_r_groups:
+                        continue
+                    seen_r_groups.add((name, group))
+                    r_groups.append({name: res.group('group')})
+        if r_groups and result['reactions']:
+            seen_r_groups = set([pair[0] for pair in seen_r_groups])
+            orig_reaction = result['reactions'][0]
+            graphs = get_atoms_and_bonds(figure['figure']['image'], orig_reaction, molscribe, batch_size=batch_size)
+            relevant_locs = {}
+            for i, graph in enumerate(graphs):
+                to_add = []
+                for j, atom in enumerate(graph['chartok_coords']['symbols']):
+                    if atom[1:-1] in seen_r_groups:
+                        to_add.append((atom[1:-1], j))
+                relevant_locs[i] = to_add
+            for r_group in r_groups:
+                reaction = get_replaced_reaction(orig_reaction, graphs, relevant_locs, r_group, molscribe)
+                to_add ={
+                    'reactants': reaction['reactants'][:],
+                    'conditions': orig_reaction['conditions'][:],
+                    'products': reaction['products'][:]
+                }
+                result['reactions'].append(to_add)
+    return results
+def process_tables(figures, results, molscribe, batch_size=16):
+    r_group_pattern = re.compile(r'^(\w+-)?(?P<group>[\w-]+)( \(\w+\))?$')
+    for figure, result in zip(figures, results):
+        result['page'] = figure['page']
+        if figure['table']['content'] is not None:
+            content = figure['table']['content']
+            if len(result['reactions']) > 1:
+                print("Warning: multiple reactions detected for table")
+            elif len(result['reactions']) == 0:
+                continue
+            orig_reaction = result['reactions'][0]
+            graphs = get_atoms_and_bonds(figure['figure']['image'], orig_reaction, molscribe, batch_size=batch_size)
+            relevant_locs = find_relevant_groups(graphs, content['columns'])
+            conditions_to_extend = []
+            for row in content['rows']:
+                r_groups = {}
+                expanded_conditions = orig_reaction['conditions'][:]
+                replaced = False
+                for col, entry in zip(content['columns'], row):
+                    if col['tag'] != 'alkyl group':
+                        expanded_conditions.append({
+                            'category': '[Table]',
+                            'text': entry['text'],
+                            'tag': col['tag'],
+                            'header': col['text'],
+                        })
+                    else:
+                        found = r_group_pattern.match(entry['text'])
+                        if found is not None:
+                            r_groups[col['text']] = found.group('group')
+                            replaced = True
+                reaction = get_replaced_reaction(orig_reaction, graphs, relevant_locs, r_groups, molscribe)
+                if replaced:
+                    to_add = {
+                        'reactants': reaction['reactants'][:],
+                        'conditions': expanded_conditions,
+                        'products': reaction['products'][:]
+                    }
+                    result['reactions'].append(to_add)
+                else:
+                    conditions_to_extend.append(expanded_conditions)
+            orig_reaction['conditions'] = [orig_reaction['conditions']]
+            orig_reaction['conditions'].extend(conditions_to_extend)
+    return results
+def get_atoms_and_bonds(image, reaction, molscribe, batch_size=16):
+    image = convert_to_cv2(image)
+    cropped_images = []
+    results = []
+    for key, molecules in reaction.items():
+        for i, elt in enumerate(molecules):
+            if type(elt) != dict or elt['category'] != '[Mol]':
+                continue
+            x1, y1, x2, y2 = elt['bbox']
+            height, width, _ = image.shape
+            cropped_images.append(image[int(y1*height):int(y2*height),int(x1*width):int(x2*width)])
+            to_add = {
+                'image': cropped_images[-1],
+                'chartok_coords': {
+                    'coords': [],
+                    'symbols': [],
+                },
+                'edges': [],
+                'key': (key, i)
+            }
+            results.append(to_add)
+    outputs = molscribe.predict_images(cropped_images, return_atoms_bonds=True, batch_size=batch_size)
+    for mol, result in zip(outputs, results):
+        for atom in mol['atoms']:
+            result['chartok_coords']['coords'].append((atom['x'], atom['y']))
+            result['chartok_coords']['symbols'].append(atom['atom_symbol'])
+        result['edges'] = [[0] * len(mol['atoms']) for _ in range(len(mol['atoms']))]
+        for bond in mol['bonds']:
+            i, j = bond['endpoint_atoms']
+            result['edges'][i][j] = BOND_TO_INT[bond['bond_type']]
+            result['edges'][j][i] = BOND_TO_INT[bond['bond_type']]
+    return results
+def find_relevant_groups(graphs, columns):
+    results = {}
+    r_groups = set([f"[{col['text']}]" for col in columns if col['tag'] == 'alkyl group'])
+    for i, graph in enumerate(graphs):
+        to_add = []
+        for j, atom in enumerate(graph['chartok_coords']['symbols']):
+            if atom in r_groups:
+                to_add.append((atom[1:-1], j))
+        results[i] = to_add
+    return results
+def get_replaced_reaction(orig_reaction, graphs, relevant_locs, mappings, molscribe):
+    graph_copy = []
+    for graph in graphs:
+        graph_copy.append({
+            'image': graph['image'],
+            'chartok_coords': {
+                'coords': graph['chartok_coords']['coords'][:],
+                'symbols': graph['chartok_coords']['symbols'][:],
+            },
+            'edges': graph['edges'][:],
+            'key': graph['key'],
+        })
+    for graph_idx, atoms in relevant_locs.items():
+        for atom, atom_idx in atoms:
+            if atom in mappings:
+                graph_copy[graph_idx]['chartok_coords']['symbols'][atom_idx] = mappings[atom]
+    reaction_copy = {}
+    def append_copy(copy_list, entity):
+        if entity['category'] == '[Mol]':
+            copy_list.append({
+                k1: v1 for k1, v1 in entity.items()
+            })
+        else:
+            copy_list.append(entity)
+    for k, v in orig_reaction.items():
+        reaction_copy[k] = []
+        for entity in v:
+            if type(entity) == list:
+                sub_list = []
+                for e in entity:
+                    append_copy(sub_list, e)
+                reaction_copy[k].append(sub_list)
+            else:
+                append_copy(reaction_copy[k], entity)
+    for graph in graph_copy:
+        output = molscribe.convert_graph_to_output([graph], [graph['image']])
+        molecule = reaction_copy[graph['key'][0]][graph['key'][1]]
+        molecule['smiles'] = output[0]['smiles']
+        molecule['molfile'] = output[0]['molfile']
+    return reaction_copy
+def get_sites(tar, ref, ref_site = False):
+    rdDepictor.Compute2DCoords(ref)
+    rdDepictor.Compute2DCoords(tar)
+    idx_pair = rdDepictor.GenerateDepictionMatching2DStructure(tar, ref)
+    in_template = [i[1] for i in idx_pair]
+    sites = []
+    for i in range(tar.GetNumAtoms()):
+        if i not in in_template:
+            for j in tar.GetAtomWithIdx(i).GetNeighbors():
+                if j.GetIdx() in in_template and j.GetIdx() not in sites:
+                    if ref_site: sites.append(idx_pair[in_template.index(j.GetIdx())][0])
+                    else: sites.append(idx_pair[in_template.index(j.GetIdx())][0])
+    return sites
+def get_atom_mapping(prod_mol, prod_smiles, r_sites_reversed = None):
+    # returns prod_mol_to_query which is the mapping of atom indices in prod_mol to the atom indices of the molecule represented by prod_smiles
+    prod_template_intermediate = Chem.MolToSmiles(prod_mol)
+    prod_template = prod_smiles
+    for r in RGROUP_SMILES:
+        if r!='*' and r!='(*)':
+            prod_template = prod_template.replace(r, '*')
+            prod_template_intermediate = prod_template_intermediate.replace(r, '*')
+    prod_template_intermediate_mol = Chem.MolFromSmiles(prod_template_intermediate)
+    prod_template_mol = Chem.MolFromSmiles(prod_template)
+    p = Chem.AdjustQueryParameters.NoAdjustments()
+    p.makeDummiesQueries = True
+    prod_template_mol_query = Chem.AdjustQueryProperties(prod_template_mol, p)
+    prod_template_intermediate_mol_query = Chem.AdjustQueryProperties(prod_template_intermediate_mol, p)
+    rdDepictor.Compute2DCoords(prod_mol)
+    rdDepictor.Compute2DCoords(prod_template_mol_query)
+    rdDepictor.Compute2DCoords(prod_template_intermediate_mol_query)
+    idx_pair = rdDepictor.GenerateDepictionMatching2DStructure(prod_mol, prod_template_intermediate_mol_query)
+    intermdiate_to_prod_mol = {a:b for a,b in idx_pair}
+    prod_mol_to_intermediate = {b:a for a,b in idx_pair}
+    #idx_pair_2 = rdDepictor.GenerateDepictionMatching2DStructure(prod_template_mol_query, prod_template_intermediate_mol_query)
+    #intermediate_to_query = {a:b for a,b in idx_pair_2}
+    #query_to_intermediate = {b:a for a,b in idx_pair_2}
+    #prod_mol_to_query = {a:intermediate_to_query[prod_mol_to_intermediate[a]] for a in prod_mol_to_intermediate}
+    substructs = prod_template_mol_query.GetSubstructMatches(prod_template_intermediate_mol_query, uniquify = False)
+    #idx_pair_2 = rdDepictor.GenerateDepictionMatching2DStructure(prod_template_mol_query, prod_template_intermediate_mol_query)
+    for substruct in substructs:
+        intermediate_to_query = {a:b for a, b in enumerate(substruct)}
+        query_to_intermediate = {intermediate_to_query[i]: i for i in intermediate_to_query}
+        prod_mol_to_query = {a:intermediate_to_query[prod_mol_to_intermediate[a]] for a in prod_mol_to_intermediate}
+        good_map = True
+        for i in r_sites_reversed:
+            if prod_template_mol_query.GetAtomWithIdx(prod_mol_to_query[i]).GetSymbol() not in RGROUP_SMILES:
+                good_map = False
+        if good_map:
+            break
+    return prod_mol_to_query, prod_template_mol_query
+def clean_corefs(coref_results_dict, idx):
+    label_pattern = rf'{re.escape(idx)}[a-zA-Z]+'
+    #unclean_pattern = re.escape(idx) + r'\d(?![\d% ])'
+    toreturn = {}
+    for prod in coref_results_dict:
+        has_good_label = False
+        for parsed in coref_results_dict[prod]:
+            if re.search(label_pattern, parsed):
+                has_good_label = True
+        if not has_good_label:
+            for parsed in coref_results_dict[prod]:
+                if idx+'1' in parsed:
+                    coref_results_dict[prod].append(idx+'l')
+                elif idx+'0' in parsed:
+                    coref_results_dict[prod].append(idx+'o')
+                elif idx+'5' in parsed:
+                    coref_results_dict[prod].append(idx+'s')
+                elif idx+'9' in parsed:
+                    coref_results_dict[prod].append(idx+'g')
+def expand_r_group_label_helper(res, coref_smiles_to_graphs, other_prod, molscribe):
+    name = res.group('name')
+    group = res.group('group')
+    #print(other_prod)
+    atoms = coref_smiles_to_graphs[other_prod]['atoms']
+    bonds = coref_smiles_to_graphs[other_prod]['bonds']
+    #print(atoms, bonds)
+    graph = {
+        'image': None,
+        'chartok_coords': {
+            'coords': [],
+            'symbols': [],
+        },
+        'edges': [],
+        'key': None
+    }
+    for atom in atoms:
+        graph['chartok_coords']['coords'].append((atom['x'], atom['y']))
+        graph['chartok_coords']['symbols'].append(atom['atom_symbol'])
+    graph['edges'] = [[0] * len(atoms) for _ in range(len(atoms))]
+    for bond in bonds:
+        i, j = bond['endpoint_atoms']
+        graph['edges'][i][j] = BOND_TO_INT[bond['bond_type']]
+        graph['edges'][j][i] = BOND_TO_INT[bond['bond_type']]
+    for i, symbol in enumerate(graph['chartok_coords']['symbols']):
+        if symbol[1:-1] == name:
+            graph['chartok_coords']['symbols'][i] = group
+    #print(graph)
+    o = molscribe.convert_graph_to_output([graph], [graph['image']])
+    return Chem.MolFromSmiles(o[0]['smiles'])
+def get_r_group_frags_and_substitute(other_prod_mol, query, reactant_mols, reactant_information, parsed, toreturn):
+    """Cut the R-group fragments out of a product and graft them onto the reactants.
+
+    Args:
+        other_prod_mol: Concrete product molecule to be matched by the template.
+        query: Tuple ``(prod_template_mol_query, r_sites_reversed_new, h_sites,
+            num_r_groups)`` as built by ``query_enumeration``.
+        reactant_mols: Reactant template molecules, indexed by the keys of
+            ``reactant_information``.
+        reactant_information: Maps a reactant index to a list of
+            ``(r_group, r_index, connect_index)`` triples.
+        parsed: Value stored unchanged as the third item of the appended result.
+        toreturn: List that receives
+            ``(modified reactant SMILES, [product SMILES], parsed)``.
+
+    Returns:
+        True when the template matches ``other_prod_mol`` (a result was
+        appended), False otherwise.
+    """
+    prod_template_mol_query, r_sites_reversed_new, h_sites, num_r_groups = query
+    # we get the substruct matches. note that we set uniquify to false since the order matters for our method
+    substructs = other_prod_mol.GetSubstructMatches(prod_template_mol_query, uniquify = False)
+    #for r in r_sites_reversed:
+    #    print(prod_template_mol_query.GetAtomWithIdx(prod_mol_to_query[r]).GetSymbol())
+    # for each substruct we create the mapping of the substruct onto the other_mol,
+    # delete all the atoms in other_mol that correspond to the substruct,
+    # and check whether the number of mol frags is equal to the number of r groups.
+    # we do this to make sure we have the correct substruct
+    if len(substructs) >= 1:
+        for substruct in substructs:
+            query_to_other = {a:b for a,b in enumerate(substruct)}
+            other_to_query = {query_to_other[i]:i for i in query_to_other}
+            editable = Chem.EditableMol(other_prod_mol)
+            # pairs [atom index in other_prod_mol, r group name] for every r site
+            r_site_correspondence = []
+            for r in r_sites_reversed_new:
+                #get its id in substruct
+                substruct_id = query_to_other[r]
+                r_site_correspondence.append([substruct_id, r_sites_reversed_new[r]])
+            # remove matched atoms from the highest index down; the atoms matched to r sites are kept
+            for idx in tuple(sorted(substruct, reverse = True)):
+                if idx not in [query_to_other[i] for i in r_sites_reversed_new]:
+                    editable.RemoveAtom(idx)
+                    # removing an atom shifts every higher atom index down by one
+                    for r_site in r_site_correspondence:
+                        if idx < r_site[0]:
+                            r_site[0]-=1
+            other_prod_removed = editable.GetMol()
+            if len(Chem.GetMolFrags(other_prod_removed, asMols = False)) == num_r_groups:
+                break
+        # NOTE(review): if no substruct gives the expected fragment count, the loop ends
+        # without break and the values of the last substruct tried are used below - confirm
+        # need to compute the sites which correspond to each r_site_reversed
+        r_site_correspondence.sort(key = lambda x: x[0])
+        # f and ff are filled in by GetMolFrags; they are used below as
+        # f[atom index] -> fragment index and ff[fragment index] -> atom indices
+        f = []
+        ff = []
+        frags = Chem.GetMolFrags(other_prod_removed, asMols = True, frags = f, fragsMolAtomMapping = ff)
+        # r_group_information maps r group name --> the fragment/molecule corresponding to the r group and the atom index it should be connected at
+        r_group_information = {}
+        #tosubtract = 0
+        for idx, r_site in enumerate(r_site_correspondence):
+            r_group_information[r_site[1]]= (frags[f[r_site[0]]], ff[f[r_site[0]]].index(r_site[0]))
+            #tosubtract += len(ff[idx])
+        # r groups listed in h_sites are filled with a single hydrogen atom
+        for r_site in h_sites:
+            r_group_information[r_site] = (Chem.MolFromSmiles('[H]'), 0)
+        # now we modify all of the reactants according to the R groups we have found
+        # for every reactant we disconnect its r group symbol, and connect it to the r group
+        # the deep copy is only read for reactants that carry no r group information
+        modify_reactants = copy.deepcopy(reactant_mols)
+        modified_reactant_smiles = []
+        for reactant_idx in reactant_information:
+            if len(reactant_information[reactant_idx]) == 0:
+                modified_reactant_smiles.append(Chem.MolToSmiles(modify_reactants[reactant_idx]))
+            else:
+                combined = reactant_mols[reactant_idx]
+                if combined.GetNumAtoms() == 1:
+                    # the reactant is a lone atom: it is replaced by the fragment of its first r group
+                    r_group, _, _ = reactant_information[reactant_idx][0]
+                    modified_reactant_smiles.append(Chem.MolToSmiles(r_group_information[r_group][0]))
+                else:
+                    # append every fragment to the reactant; fragment atoms follow the reactant's own atoms
+                    for r_group, r_index, connect_index in reactant_information[reactant_idx]:
+                        combined = Chem.CombineMols(combined, r_group_information[r_group][0])
+                    editable = Chem.EditableMol(combined)
+                    # offset of the current fragment's first atom inside the combined molecule
+                    atomIdxAdder = reactant_mols[reactant_idx].GetNumAtoms()
+                    for r_group, r_index, connect_index in reactant_information[reactant_idx]:
+                        Chem.EditableMol.RemoveBond(editable, r_index, connect_index)
+                        Chem.EditableMol.AddBond(editable, connect_index, atomIdxAdder + r_group_information[r_group][1], Chem.BondType.SINGLE)
+                        atomIdxAdder += r_group_information[r_group][0].GetNumAtoms()
+                    # remove the placeholder atoms from the highest index down so that
+                    # the remaining indices stay valid
+                    r_indices = [i[1] for i in reactant_information[reactant_idx]]
+                    r_indices.sort(reverse = True)
+                    for r_index in r_indices:
+                        Chem.EditableMol.RemoveAtom(editable, r_index)
+                    # round trip through SMILES before the final SMILES is written
+                    modified_reactant_smiles.append(Chem.MolToSmiles(Chem.MolFromSmiles(Chem.MolToSmiles(editable.GetMol()))))
+        toreturn.append((modified_reactant_smiles, [Chem.MolToSmiles(other_prod_mol)], parsed))
+        return True
+    else:
+        return False
+def query_enumeration(prod_template_mol_query, r_sites_reversed_new, num_r_groups):
+    """Enumerate variants of the product template with subsets of R-group atoms removed.
+
+    Args:
+        prod_template_mol_query: Template query molecule.
+        r_sites_reversed_new: Dict mapping the atom index of an R-group atom in
+            the template to the R-group name.
+        num_r_groups: Number of R groups; subsets of ``range(num_r_groups)``
+            are enumerated.
+
+    Returns:
+        A list with one tuple per subset:
+        ``(edited molecule, remaining r sites {atom index: name},
+        names of the removed r groups, number of remaining r groups)``.
+        The tuples have the layout unpacked as ``query`` in
+        ``get_r_group_frags_and_substitute``.
+    """
+    subsets = generate_subsets(num_r_groups)
+    toreturn = []
+    for subset in subsets:
+        # rebuilt for every subset, because the indices are shifted in place below
+        r_sites_list = [[i, r_sites_reversed_new[i]] for i in r_sites_reversed_new]
+        r_sites_list.sort(key = lambda x: x[0])
+        to_edit = Chem.EditableMol(prod_template_mol_query)
+        # first detach every selected R atom from its first neighbour ...
+        for entry in subset:
+            pos = r_sites_list[entry][0]
+            Chem.EditableMol.RemoveBond(to_edit, r_sites_list[entry][0], prod_template_mol_query.GetAtomWithIdx(r_sites_list[entry][0]).GetNeighbors()[0].GetIdx())
+        # ... then delete the atoms. generate_subsets lists each subset in descending order
+        # and r_sites_list is sorted by atom index, so atoms are removed from the highest
+        # index down and the indices still to be removed stay valid
+        for entry in subset:
+            pos = r_sites_list[entry][0]
+            Chem.EditableMol.RemoveAtom(to_edit, pos)
+        edited = to_edit.GetMol()
+        # every r site that comes after a removed one moves down by one atom index
+        for entry in subset:
+            for i in range(entry + 1, num_r_groups):
+                r_sites_list[i][0]-=1
+        new_r_sites = {}
+        new_h_sites = set()
+        for i in range(num_r_groups):
+            if i not in subset:
+                new_r_sites[r_sites_list[i][0]] = r_sites_list[i][1]
+            else:
+                # removed r groups are recorded by name
+                new_h_sites.add(r_sites_list[i][1])
+        toreturn.append((edited, new_r_sites, new_h_sites, num_r_groups - len(subset)))
+    return toreturn
+def generate_subsets(n):
+    def backtrack(start, subset):
+        result.append(subset[:])
+        for i in range(start, -1, -1):  # Iterate in reverse order
+            subset.append(i)
+            backtrack(i - 1, subset)
+            subset.pop()
+    result = []
+    backtrack(n - 1, [])
+    return sorted(result, key=lambda x: (-len(x), x), reverse=True)
+def backout(results, coref_results, molscribe):
+    toreturn = []
+    if not results or not results[0]['reactions'] or not coref_results:
+        return toreturn
+    try:
+        reactants = results[0]['reactions'][0]['reactants']
+        products = [i['smiles'] for i in results[0]['reactions'][0]['products']]
+        coref_results_dict = {coref_results[0]['bboxes'][coref[0]]['smiles']: coref_results[0]['bboxes'][coref[1]]['text']  for coref in coref_results[0]['corefs']}
+        coref_smiles_to_graphs = {coref_results[0]['bboxes'][coref[0]]['smiles']: coref_results[0]['bboxes'][coref[0]]  for coref in coref_results[0]['corefs']}
+        if len(products) == 1:
+            if products[0] not in coref_results_dict:
+                print("Warning: No Label Parsed")
+                return
+            product_labels = coref_results_dict[products[0]]
+            prod = products[0]
+            label_idx = product_labels[0]
+            '''
+            if len(product_labels) == 1:
+                # get the coreference label of the product molecule
+                label_idx = product_labels[0]
+            else:
+                print("Warning: Malformed Label Parsed.")
+                return
+            '''
+        else:
+            print("Warning: More than one product detected")
+            return
+        # format the regular expression for labels that correspond to the product label
+        numbers = re.findall(r'\d+', label_idx)
+        label_idx = numbers[0] if len(numbers) > 0 else ""
+        label_pattern = rf'{re.escape(label_idx)}[a-zA-Z]+'
+        prod_smiles = prod
+        prod_mol = Chem.MolFromMolBlock(results[0]['reactions'][0]['products'][0]['molfile'])
+        # identify the atom indices of the R groups in the product tempalte
+        h_counter = 0
+        r_sites = {}
+        for idx, atom in enumerate(results[0]['reactions'][0]['products'][0]['atoms']):
+            sym = atom['atom_symbol']
+            if sym == '[H]':
+                h_counter += 1
+            if sym[0] == '[':
+                sym = sym[1:-1]
+                if sym[0] == 'R' and sym[1:].isdigit():
+                    sym = sym[1:]+"*"
+                sym = f'[{sym}]'
+            if sym in RGROUP_SYMBOLS:
+                if sym not in r_sites:
+                    r_sites[sym] = [idx-h_counter]
+                else:
+                    r_sites[sym].append(idx-h_counter)
+        r_sites_reversed = {}
+        for sym in r_sites:
+            for pos in r_sites[sym]:
+                r_sites_reversed[pos] = sym
+        num_r_groups = len(r_sites_reversed)
+        #prepare the product template and get the associated mapping
+        prod_mol_to_query, prod_template_mol_query = get_atom_mapping(prod_mol, prod_smiles, r_sites_reversed = r_sites_reversed)
+        reactant_mols = []
+        #--------------process the reactants-----------------
+        reactant_information = {} #index of relevant reaction --> [[R group name, atom index of R group, atom index of R group connection], ...]
+        for idx, reactant in enumerate(reactants):
+            reactant_information[idx] = []
+            reactant_mols.append(Chem.MolFromSmiles(reactant['smiles']))
+            has_r = False
+            r_sites_reactant = {}
+            h_counter = 0
+            for a_idx, atom in enumerate(reactant['atoms']):
+                #go through all atoms and check if they are an R group, if so add it to reactant information
+                sym = atom['atom_symbol']
+                if sym == '[H]':
+                    h_counter += 1
+                if sym[0] == '[':
+                    sym = sym[1:-1]
+                    if sym[0] == 'R' and sym[1:].isdigit():
+                        sym = sym[1:]+"*"
+                    sym = f'[{sym}]'
+                if sym in r_sites:
+                    if reactant_mols[-1].GetNumAtoms()==1:
+                        reactant_information[idx].append([sym, -1, -1])
+                    else:
+                        has_r = True
+                        reactant_mols[-1] = Chem.MolFromMolBlock(reactant['molfile'])
+                        reactant_information[idx].append([sym, a_idx-h_counter, [i.GetIdx() for i in reactant_mols[-1].GetAtomWithIdx(a_idx-h_counter).GetNeighbors()][0]])
+                        r_sites_reactant[sym] = a_idx-h_counter
+                elif sym == '[1*]' and '[7*]' in r_sites:
+                    if reactant_mols[-1].GetNumAtoms()==1:
+                        reactant_information[idx].append(['[7*]', -1, -1])
+                    else:
+                        has_r = True
+                        reactant_mols[-1] = Chem.MolFromMolBlock(reactant['molfile'])
+                        reactant_information[idx].append(['[7*]', a_idx-h_counter, [i.GetIdx() for i in reactant_mols[-1].GetAtomWithIdx(a_idx-h_counter).GetNeighbors()][0]])
+                        r_sites_reactant['[7*]'] = a_idx-h_counter
+                elif sym == '[7*]' and '[1*]' in r_sites:
+                    if reactant_mols[-1].GetNumAtoms()==1:
+                        reactant_information[idx].append(['[1*]', -1, -1])
+                    else:
+                        has_r = True
+                        reactant_mols[-1] = Chem.MolFromMolBlock(reactant['molfile'])
+                        reactant_information[idx].append(['[1*]', a_idx-h_counter, [i.GetIdx() for i in reactant_mols[-1].GetAtomWithIdx(a_idx-h_counter).GetNeighbors()][0]])
+                        r_sites_reactant['[1*]'] = a_idx-h_counter
+                elif sym == '[1*]' and '[Rf]' in r_sites:
+                    if reactant_mols[-1].GetNumAtoms()==1:
+                        reactant_information[idx].append(['[Rf]', -1, -1])
+                    else:
+                        has_r = True
+                        reactant_mols[-1] = Chem.MolFromMolBlock(reactant['molfile'])
+                        reactant_information[idx].append(['[Rf]', a_idx-h_counter, [i.GetIdx() for i in reactant_mols[-1].GetAtomWithIdx(a_idx-h_counter).GetNeighbors()][0]])
+                        r_sites_reactant['[Rf]'] = a_idx-h_counter
+                elif sym == '[Rf]' and '[1*]' in r_sites:
+                    if reactant_mols[-1].GetNumAtoms()==1:
+                        reactant_information[idx].append(['[1*]', -1, -1])
+                    else:
+                        has_r = True
+                        reactant_mols[-1] = Chem.MolFromMolBlock(reactant['molfile'])
+                        reactant_information[idx].append(['[1*]', a_idx-h_counter, [i.GetIdx() for i in reactant_mols[-1].GetAtomWithIdx(a_idx-h_counter).GetNeighbors()][0]])
+                        r_sites_reactant['[1*]'] = a_idx-h_counter
+            r_sites_reversed_reactant = {r_sites_reactant[i]: i for i in r_sites_reactant}
+            # if the reactant had r groups, we had to use the molecule generated from the MolBlock.
+            # but the molblock may have unexpanded elemeents that are not R groups
+            # so we have to map back the r group indices in the molblock version to the full molecule generated by the smiles
+            # and adjust the indices of the r groups accordingly
+            if has_r:
+                #get the mapping
+                reactant_mol_to_query, _ = get_atom_mapping(reactant_mols[-1], reactant['smiles'], r_sites_reversed = r_sites_reversed_reactant)
+                #make the adjustment
+                for info in reactant_information[idx]:
+                    info[1] = reactant_mol_to_query[info[1]]
+                    info[2] = reactant_mol_to_query[info[2]]
+                reactant_mols[-1] = Chem.MolFromSmiles(reactant['smiles'])
+        #go through all the molecules in the coreference
+        clean_corefs(coref_results_dict, label_idx)
+        for other_prod in coref_results_dict:
+            #check if they match the product label regex
+            found_good_label = False
+            for parsed in coref_results_dict[other_prod]:
+                if re.search(label_pattern, parsed) and not found_good_label:
+                    found_good_label = True
+                    other_prod_mol = Chem.MolFromSmiles(other_prod)
+                    if other_prod != prod_smiles and other_prod_mol is not None:
+                        #check if there are R groups to be resolved in the target product
+                        all_other_prod_mols = []
+                        r_group_sub_pattern = re.compile('(?P<name>[RXY]\d?)[ ]*=[ ]*(?P<group>\w+)')
+                        for parsed_labels in coref_results_dict[other_prod]:
+                            res = r_group_sub_pattern.search(parsed_labels)
+                            if res is not None:
+                                all_other_prod_mols.append((expand_r_group_label_helper(res, coref_smiles_to_graphs, other_prod, molscribe), parsed + parsed_labels))
+                        if len(all_other_prod_mols) == 0:
+                            if other_prod_mol is not None:
+                                all_other_prod_mols.append((other_prod_mol, parsed))
+                        for other_prod_mol, parsed in all_other_prod_mols:
+                            other_prod_frags = Chem.GetMolFrags(other_prod_mol, asMols = True)
+                            for other_prod_frag in other_prod_frags:
+                                substructs = other_prod_frag.GetSubstructMatches(prod_template_mol_query, uniquify = False)
+                                if len(substructs)>0:
+                                    other_prod_mol = other_prod_frag
+                                    break
+                            r_sites_reversed_new = {prod_mol_to_query[r]: r_sites_reversed[r] for r in r_sites_reversed}
+                            queries = query_enumeration(prod_template_mol_query, r_sites_reversed_new, num_r_groups)
+                            matched = False
+                            for query in queries:
+                                if not matched:
+                                    try:
+                                        matched = get_r_group_frags_and_substitute(other_prod_mol, query, reactant_mols, reactant_information, parsed, toreturn)
+                                    except:
+                                        pass
+    except:
+        pass
+    return toreturn
+def backout_without_coref(results, coref_results, coref_results_dict, coref_smiles_to_graphs, molscribe):
+    toreturn = []
+    if not results or not results[0]['reactions'] or not coref_results:
+        return toreturn
+    try:
+        reactants = results[0]['reactions'][0]['reactants']
+        products = [i['smiles'] for i in results[0]['reactions'][0]['products']]
+        coref_results_dict = coref_results_dict
+        coref_smiles_to_graphs = coref_smiles_to_graphs
+        if len(products) == 1:
+            if products[0] not in coref_results_dict:
+                print("Warning: No Label Parsed")
+                return
+            product_labels = coref_results_dict[products[0]]
+            prod = products[0]
+            label_idx = product_labels[0]
+            '''
+            if len(product_labels) == 1:
+                # get the coreference label of the product molecule
+                label_idx = product_labels[0]
+            else:
+                print("Warning: Malformed Label Parsed.")
+                return
+            '''
+        else:
+            print("Warning: More than one product detected")
+            return
+        # format the regular expression for labels that correspond to the product label
+        numbers = re.findall(r'\d+', label_idx)
+        label_idx = numbers[0] if len(numbers) > 0 else ""
+        label_pattern = rf'{re.escape(label_idx)}[a-zA-Z]+'
+        prod_smiles = prod
+        prod_mol = Chem.MolFromMolBlock(results[0]['reactions'][0]['products'][0]['molfile'])
+        # identify the atom indices of the R groups in the product tempalte
+        h_counter = 0
+        r_sites = {}
+        for idx, atom in enumerate(results[0]['reactions'][0]['products'][0]['atoms']):
+            sym = atom['atom_symbol']
+            if sym == '[H]':
+                h_counter += 1
+            if sym[0] == '[':
+                sym = sym[1:-1]
+                if sym[0] == 'R' and sym[1:].isdigit():
+                    sym = sym[1:]+"*"
+                sym = f'[{sym}]'
+            if sym in RGROUP_SYMBOLS:
+                if sym not in r_sites:
+                    r_sites[sym] = [idx-h_counter]
+                else:
+                    r_sites[sym].append(idx-h_counter)
+        r_sites_reversed = {}
+        for sym in r_sites:
+            for pos in r_sites[sym]:
+                r_sites_reversed[pos] = sym
+        num_r_groups = len(r_sites_reversed)
+        #prepare the product template and get the associated mapping
+        prod_mol_to_query, prod_template_mol_query = get_atom_mapping(prod_mol, prod_smiles, r_sites_reversed = r_sites_reversed)
+        reactant_mols = []
+        #--------------process the reactants-----------------
+        reactant_information = {} #index of relevant reaction --> [[R group name, atom index of R group, atom index of R group connection], ...]
+        for idx, reactant in enumerate(reactants):
+            reactant_information[idx] = []
+            reactant_mols.append(Chem.MolFromSmiles(reactant['smiles']))
+            has_r = False
+            r_sites_reactant = {}
+            h_counter = 0
+            for a_idx, atom in enumerate(reactant['atoms']):
+                #go through all atoms and check if they are an R group, if so add it to reactant information
+                sym = atom['atom_symbol']
+                if sym == '[H]':
+                    h_counter += 1
+                if sym[0] == '[':
+                    sym = sym[1:-1]
+                    if sym[0] == 'R' and sym[1:].isdigit():
+                        sym = sym[1:]+"*"
+                    sym = f'[{sym}]'
+                if sym in r_sites:
+                    if reactant_mols[-1].GetNumAtoms()==1:
+                        reactant_information[idx].append([sym, -1, -1])
+                    else:
+                        has_r = True
+                        reactant_mols[-1] = Chem.MolFromMolBlock(reactant['molfile'])
+                        reactant_information[idx].append([sym, a_idx-h_counter, [i.GetIdx() for i in reactant_mols[-1].GetAtomWithIdx(a_idx-h_counter).GetNeighbors()][0]])
+                        r_sites_reactant[sym] = a_idx-h_counter
+                elif sym == '[1*]' and '[7*]' in r_sites:
+                    if reactant_mols[-1].GetNumAtoms()==1:
+                        reactant_information[idx].append(['[7*]', -1, -1])
+                    else:
+                        has_r = True
+                        reactant_mols[-1] = Chem.MolFromMolBlock(reactant['molfile'])
+                        reactant_information[idx].append(['[7*]', a_idx-h_counter, [i.GetIdx() for i in reactant_mols[-1].GetAtomWithIdx(a_idx-h_counter).GetNeighbors()][0]])
+                        r_sites_reactant['[7*]'] = a_idx-h_counter
+                elif sym == '[7*]' and '[1*]' in r_sites:
+                    if reactant_mols[-1].GetNumAtoms()==1:
+                        reactant_information[idx].append(['[1*]', -1, -1])
+                    else:
+                        has_r = True
+                        reactant_mols[-1] = Chem.MolFromMolBlock(reactant['molfile'])
+                        reactant_information[idx].append(['[1*]', a_idx-h_counter, [i.GetIdx() for i in reactant_mols[-1].GetAtomWithIdx(a_idx-h_counter).GetNeighbors()][0]])
+                        r_sites_reactant['[1*]'] = a_idx-h_counter
+                elif sym == '[1*]' and '[Rf]' in r_sites:
+                    if reactant_mols[-1].GetNumAtoms()==1:
+                        reactant_information[idx].append(['[Rf]', -1, -1])
+                    else:
+                        has_r = True
+                        reactant_mols[-1] = Chem.MolFromMolBlock(reactant['molfile'])
+                        reactant_information[idx].append(['[Rf]', a_idx-h_counter, [i.GetIdx() for i in reactant_mols[-1].GetAtomWithIdx(a_idx-h_counter).GetNeighbors()][0]])
+                        r_sites_reactant['[Rf]'] = a_idx-h_counter
+                elif sym == '[Rf]' and '[1*]' in r_sites:
+                    if reactant_mols[-1].GetNumAtoms()==1:
+                        reactant_information[idx].append(['[1*]', -1, -1])
+                    else:
+                        has_r = True
+                        reactant_mols[-1] = Chem.MolFromMolBlock(reactant['molfile'])
+                        reactant_information[idx].append(['[1*]', a_idx-h_counter, [i.GetIdx() for i in reactant_mols[-1].GetAtomWithIdx(a_idx-h_counter).GetNeighbors()][0]])
+                        r_sites_reactant['[1*]'] = a_idx-h_counter
+            r_sites_reversed_reactant = {r_sites_reactant[i]: i for i in r_sites_reactant}
+            # if the reactant had r groups, we had to use the molecule generated from the MolBlock.
+            # but the molblock may have unexpanded elemeents that are not R groups
+            # so we have to map back the r group indices in the molblock version to the full molecule generated by the smiles
+            # and adjust the indices of the r groups accordingly
+            if has_r:
+                #get the mapping
+                reactant_mol_to_query, _ = get_atom_mapping(reactant_mols[-1], reactant['smiles'], r_sites_reversed = r_sites_reversed_reactant)
+                #make the adjustment
+                for info in reactant_information[idx]:
+                    info[1] = reactant_mol_to_query[info[1]]
+                    info[2] = reactant_mol_to_query[info[2]]
+                reactant_mols[-1] = Chem.MolFromSmiles(reactant['smiles'])
+        #go through all the molecules in the coreference
+        clean_corefs(coref_results_dict, label_idx)
+        for other_prod in coref_results_dict:
+            #check if they match the product label regex
+            found_good_label = False
+            for parsed in coref_results_dict[other_prod]:
+                if re.search(label_pattern, parsed) and not found_good_label:
+                    found_good_label = True
+                    other_prod_mol = Chem.MolFromSmiles(other_prod)
+                    if other_prod != prod_smiles and other_prod_mol is not None:
+                        #check if there are R groups to be resolved in the target product
+                        all_other_prod_mols = []
+                        r_group_sub_pattern = re.compile('(?P<name>[RXY]\d?)[ ]*=[ ]*(?P<group>\w+)')
+                        for parsed_labels in coref_results_dict[other_prod]:
+                            res = r_group_sub_pattern.search(parsed_labels)
+                            if res is not None:
+                                all_other_prod_mols.append((expand_r_group_label_helper(res, coref_smiles_to_graphs, other_prod, molscribe), parsed + parsed_labels))
+                        if len(all_other_prod_mols) == 0:
+                            if other_prod_mol is not None:
+                                all_other_prod_mols.append((other_prod_mol, parsed))
+                        for other_prod_mol, parsed in all_other_prod_mols:
+                            other_prod_frags = Chem.GetMolFrags(other_prod_mol, asMols = True)
+                            for other_prod_frag in other_prod_frags:
+                                substructs = other_prod_frag.GetSubstructMatches(prod_template_mol_query, uniquify = False)
+                                if len(substructs)>0:
+                                    other_prod_mol = other_prod_frag
+                                    break
+                            r_sites_reversed_new = {prod_mol_to_query[r]: r_sites_reversed[r] for r in r_sites_reversed}
+                            queries = query_enumeration(prod_template_mol_query, r_sites_reversed_new, num_r_groups)
+                            matched = False
+                            for query in queries:
+                                if not matched:
+                                    try:
+                                        matched = get_r_group_frags_and_substitute(other_prod_mol, query, reactant_mols, reactant_information, parsed, toreturn)
+                                    except:
+                                        pass
+    except:
+        pass
+    return toreturn
+def associate_corefs(results, results_coref):
+    coref_smiles = {}
+    idx_pattern = r'\b\d+[a-zA-Z]{0,2}\b'
+    for result_coref in results_coref:
+        bboxes, corefs = result_coref['bboxes'], result_coref['corefs']
+        for coref in corefs:
+            mol, idt = coref[0], coref[1]
+            if len(bboxes[idt]['text']) > 0:
+                for text in bboxes[idt]['text']:
+                    matches = re.findall(idx_pattern, text)
+                    for match in matches:
+                        coref_smiles[match] = bboxes[mol]['smiles']
+    for page in results:
+        for reactions in page['reactions']:
+            for reaction in reactions['reactions']:
+                if 'Reactants' in reaction:
+                    if isinstance(reaction['Reactants'], tuple):
+                        if reaction['Reactants'][0] in coref_smiles:
+                            reaction['Reactants'] = (f'{reaction["Reactants"][0]} ({coref_smiles[reaction["Reactants"][0]]})', reaction['Reactants'][1], reaction['Reactants'][2])
+                    else:
+                        for idx, compound in enumerate(reaction['Reactants']):
+                            if compound[0] in coref_smiles:
+                                reaction['Reactants'][idx] = (f'{compound[0]} ({coref_smiles[compound[0]]})', compound[1], compound[2])
+                if 'Product' in reaction:
+                    if isinstance(reaction['Product'], tuple):
+                        if reaction['Product'][0] in coref_smiles:
+                            reaction['Product'] = (f'{reaction["Product"][0]} ({coref_smiles[reaction["Product"][0]]})', reaction['Product'][1], reaction['Product'][2])
+                    else:
+                        for idx, compound in enumerate(reaction['Product']):
+                            if compound[0] in coref_smiles:
+                                reaction['Product'][idx] = (f'{compound[0]} ({coref_smiles[compound[0]]})', compound[1], compound[2])
+    return results
+def expand_reactions_with_backout(initial_results, results_coref, molscribe):
+    idx_pattern = r'^\d+[a-zA-Z]{0,2}$'
+    for reactions, result_coref in zip(initial_results, results_coref):
+        if not reactions['reactions']:
+            continue
+        try:
+            backout_results = backout([reactions], [result_coref], molscribe)
+        except Exception:
+            continue
+        conditions = reactions['reactions'][0]['conditions']
+        idt_to_smiles = {}
+        if not backout_results:
+            continue
+        for reactants, products, idt in backout_results:
+            reactions['reactions'].append({
+                'reactants': [{'category': '[Mol]', 'molfile': None, 'smiles': reactant} for reactant in reactants],
+                'conditions': conditions[:],
+                'products': [{'category': '[Mol]', 'molfile': None, 'smiles': product} for product in products]
+             })
+    return initial_results

examples/exp.png ADDED Viewed

Git LFS Details

SHA256: 3ce344ed33ff77f45d6e87a29e91426c3444ee9b58a8b10086ce3483a1ad2a2e
Pointer size: 131 Bytes
Size of remote file: 696 kB

examples/image.webp ADDED Viewed

examples/rdkit.png ADDED Viewed

examples/reaction1.jpg ADDED Viewed

examples/reaction2.png ADDED Viewed

examples/reaction3.png ADDED Viewed

examples/reaction4.png ADDED Viewed

Git LFS Details

SHA256: 341a3b9f6b24b3fe3793186ec198cf1171ffb84bca6c0316052f25e17c0eeb55
Pointer size: 131 Bytes
Size of remote file: 232 kB

get_molecular_agent.py ADDED Viewed

	@@ -0,0 +1,599 @@

+import sys
+import torch
+import json
+from chemietoolkit import ChemIEToolkit
+import cv2
+from PIL import Image
+import json
+import sys
+#sys.path.append('./RxnScribe-main/')
+import torch
+from rxnscribe import RxnScribe
+import json
+import sys
+import torch
+import json
+model = ChemIEToolkit(device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
+from molscribe.chemistry import _convert_graph_to_smiles
+import base64
+import torch
+import json
+from PIL import Image
+import numpy as np
+from chemietoolkit import ChemIEToolkit, utils
+from openai import AzureOpenAI
+import os
+ckpt_path = "./pix2seq_reaction_full.ckpt"
+model1 = RxnScribe(ckpt_path, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
+device = torch.device(('cuda' if torch.cuda.is_available() else 'cpu'))
+model = ChemIEToolkit(device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
+def get_multi_molecular(image_path: str) -> list:
+    '''Returns a list of reactions extracted from the image.'''
+    # 打开图像文件
+    image = Image.open(image_path).convert('RGB')
+    # 将图像作为输入传递给模型
+    coref_results = model.extract_molecule_corefs_from_figures([image])
+    for item in coref_results:
+        for bbox in item.get("bboxes", []):
+            for key in ["category", "molfile", "symbols", 'atoms', "bonds", 'category_id', 'score', 'corefs']: #'atoms'
+                bbox.pop(key, None)  # 安全地移除键
+    print(json.dumps(coref_results))
+    # 返回反应列表，使用 json.dumps 进行格式化
+    return json.dumps(coref_results)
+def get_multi_molecular_text_to_correct(image_path: str) -> list:
+    '''Returns a list of reactions extracted from the image.'''
+    # 打开图像文件
+    image = Image.open(image_path).convert('RGB')
+    # 将图像作为输入传递给模型
+    coref_results = model.extract_molecule_corefs_from_figures([image])
+    for item in coref_results:
+        for bbox in item.get("bboxes", []):
+            for key in ["category", "bbox", "molfile", "symbols", 'atoms', "bonds", 'category_id', 'score', 'corefs']: #'atoms'
+                bbox.pop(key, None)  # 安全地移除键
+    print(json.dumps(coref_results))
+    # 返回反应列表，使用 json.dumps 进行格式化
+    return json.dumps(coref_results)
+def get_multi_molecular_text_to_correct_withatoms(image_path: str) -> list:
+    '''Returns a list of reactions extracted from the image.'''
+    # 打开图像文件
+    image = Image.open(image_path).convert('RGB')
+    # 将图像作为输入传递给模型
+    coref_results = model.extract_molecule_corefs_from_figures([image])
+    for item in coref_results:
+        for bbox in item.get("bboxes", []):
+            for key in ["coords","edges","molfile", 'atoms', "bonds", 'category_id', 'score', 'corefs']: #'atoms'
+                bbox.pop(key, None)  # 安全地移除键
+    print(json.dumps(coref_results))
+    # 返回反应列表，使用 json.dumps 进行格式化
+    return json.dumps(coref_results)
+def process_reaction_image_with_multiple_products_and_text(image_path: str) -> dict:
+    """
+    Args:
+        image_path (str): 图像文件路径。
+    Returns:
+        dict: 整理后的反应数据，包括反应物、产物和反应模板。
+    """
+    # 配置 API Key 和 Azure Endpoint
+    api_key = os.getenv("CHEMEAGLE_API_KEY")
+    if not api_key:
+        raise RuntimeError("Missing CHEMEAGLE_API_KEY environment variable")
+ # 替换为实际的 API Key
+    azure_endpoint = "https://hkust.azure-api.net"  # 替换为实际的 Azure Endpoint
+    model = ChemIEToolkit(device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
+    client = AzureOpenAI(
+        api_key=api_key,
+        api_version='2024-06-01',
+        azure_endpoint=azure_endpoint
+    )
+    # 加载图像并编码为 Base64
+    def encode_image(image_path: str):
+        with open(image_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode('utf-8')
+    base64_image = encode_image(image_path)
+    # GPT 工具调用配置
+    tools = [
+       {
+        'type': 'function',
+        'function': {
+            'name': 'get_multi_molecular_text_to_correct_withatoms',
+            'description': 'Extracts the SMILES string, the symbols set, and the text coref of all molecular images in a table-reaction image and ready to be correct.',
+            'parameters': {
+                'type': 'object',
+                'properties': {
+                    'image_path': {
+                        'type': 'string',
+                        'description': 'The path to the reaction image.',
+                    },
+                },
+                'required': ['image_path'],
+                'additionalProperties': False,
+            },
+        },
+            },
+    ]
+    # 提供给 GPT 的消息内容
+    with open('./prompt_getmolecular.txt', 'r') as prompt_file:
+        prompt = prompt_file.read()
+    messages = [
+        {'role': 'system', 'content': 'You are a helpful assistant.'},
+        {
+            'role': 'user',
+            'content': [
+                {'type': 'text', 'text': prompt},
+                {'type': 'image_url', 'image_url': {'url': f'data:image/png;base64,{base64_image}'}}
+            ]
+        }
+    ]
+    # 调用 GPT 接口
+    response = client.chat.completions.create(
+    model = 'gpt-4o',
+    temperature = 0,
+    response_format={ 'type': 'json_object' },
+    messages = [
+        {'role': 'system', 'content': 'You are a helpful assistant.'},
+        {
+            'role': 'user',
+            'content': [
+                {
+                    'type': 'text',
+                    'text': prompt
+                },
+                {
+                    'type': 'image_url',
+                    'image_url': {
+                        'url': f'data:image/png;base64,{base64_image}'
+                    }
+                }
+            ]},
+    ],
+    tools = tools)
+# Step 1: 工具映射表
+    TOOL_MAP = {
+        'get_multi_molecular_text_to_correct_withatoms': get_multi_molecular_text_to_correct_withatoms,
+    }
+    # Step 2: 处理多个工具调用
+    tool_calls = response.choices[0].message.tool_calls
+    results = []
+    # 遍历每个工具调用
+    for tool_call in tool_calls:
+        tool_name = tool_call.function.name
+        tool_arguments = tool_call.function.arguments
+        tool_call_id = tool_call.id
+        tool_args = json.loads(tool_arguments)
+        if tool_name in TOOL_MAP:
+            # 调用工具并获取结果
+            tool_result = TOOL_MAP[tool_name](image_path)
+        else:
+            raise ValueError(f"Unknown tool called: {tool_name}")
+        # 保存每个工具调用结果
+        results.append({
+            'role': 'tool',
+            'content': json.dumps({
+                'image_path': image_path,
+                f'{tool_name}':(tool_result),
+            }),
+            'tool_call_id': tool_call_id,
+        })
+# Prepare the chat completion payload
+    completion_payload = {
+        'model': 'gpt-4o',
+        'messages': [
+            {'role': 'system', 'content': 'You are a helpful assistant.'},
+            {
+                'role': 'user',
+                'content': [
+                    {
+                        'type': 'text',
+                        'text': prompt
+                    },
+                    {
+                        'type': 'image_url',
+                        'image_url': {
+                            'url': f'data:image/png;base64,{base64_image}'
+                        }
+                    }
+                ]
+            },
+            response.choices[0].message,
+            *results
+            ],
+    }
+# Generate new response
+    response = client.chat.completions.create(
+        model=completion_payload["model"],
+        messages=completion_payload["messages"],
+        response_format={ 'type': 'json_object' },
+        temperature=0
+    )
+    # 获取 GPT 生成的结果
+    gpt_output = [json.loads(response.choices[0].message.content)]
+    def get_multi_molecular(image_path: str) -> list:
+        '''Returns a list of reactions extracted from the image.'''
+        # 打开图像文件
+        image = Image.open(image_path).convert('RGB')
+        # 将图像作为输入传递给模型
+        coref_results = model.extract_molecule_corefs_from_figures([image])
+        return coref_results
+    coref_results = get_multi_molecular(image_path)
+    def update_symbols_in_atoms(input1, input2):
+        """
+        用 input1 中更新后的 'symbols' 替换 input2 中对应 bboxes 的 'symbols'，并同步更新 'atoms' 的 'atom_symbol'。
+        假设 input1 和 input2 的结构一致。
+        """
+        for item1, item2 in zip(input1, input2):
+            bboxes1 = item1.get('bboxes', [])
+            bboxes2 = item2.get('bboxes', [])
+            if len(bboxes1) != len(bboxes2):
+                print("Warning: Mismatched number of bboxes!")
+                continue
+            for bbox1, bbox2 in zip(bboxes1, bboxes2):
+                # 更新 symbols
+                if 'symbols' in bbox1:
+                    bbox2['symbols'] = bbox1['symbols']  # 更新 symbols
+                # 更新 atoms 的 atom_symbol
+                if 'symbols' in bbox1 and 'atoms' in bbox2:
+                    symbols = bbox1['symbols']
+                    atoms = bbox2.get('atoms', [])
+                    # 确保 symbols 和 atoms 的长度一致
+                    if len(symbols) != len(atoms):
+                        print(f"Warning: Mismatched symbols and atoms in bbox {bbox1.get('bbox')}!")
+                        continue
+                    for atom, symbol in zip(atoms, symbols):
+                        atom['atom_symbol'] = symbol  # 更新 atom_symbol
+        return input2
+    input2_updated = update_symbols_in_atoms(gpt_output, coref_results)
+    def update_smiles_and_molfile(input_data, conversion_function):
+        """
+        使用更新后的 'symbols'、'coords' 和 'edges' 调用 `conversion_function` 生成新的 'smiles' 和 'molfile'，
+        并替换到原数据结构中。
+        参数:
+        - input_data: 包含 bboxes 的嵌套数据结构
+        - conversion_function: 函数，接受 'coords', 'symbols', 'edges' 并返回 (new_smiles, new_molfile, _)
+        返回:
+        - 更新后的数据结构
+        """
+        for item in input_data:
+            for bbox in item.get('bboxes', []):
+                # 检查必需的键是否存在
+                if all(key in bbox for key in ['coords', 'symbols', 'edges']):
+                    coords = bbox['coords']
+                    symbols = bbox['symbols']
+                    edges = bbox['edges']
+                    # 调用转换函数生成新的 'smiles' 和 'molfile'
+                    new_smiles, new_molfile, _ = conversion_function(coords, symbols, edges)
+                    print(f"    Generated 'smiles': {new_smiles}")
+                    # 替换旧的 'smiles' 和 'molfile'
+                    bbox['smiles'] = new_smiles
+                    bbox['molfile'] = new_molfile
+        return input_data
+    updated_data = update_smiles_and_molfile(input2_updated, _convert_graph_to_smiles)
+    return updated_data
+def process_reaction_image_with_multiple_products_and_text_correctR(image_path: str) -> dict:
+    """
+    Extract the molecules in an image and rebuild their SMILES from GPT-4o-corrected atom symbols.
+    GPT-4o (through Azure OpenAI) is offered the `get_multi_molecular_text_to_correct_withatoms`
+    tool for the image; the tool output is fed back for a second completion whose JSON answer
+    supplies corrected 'symbols'. Those symbols are copied, box by box, onto a fresh
+    `model.extract_molecule_corefs_from_figures` result, and 'smiles' / 'molfile' are regenerated.
+    Args:
+        image_path (str): Path to the image file.
+    Returns:
+        The result of `model.extract_molecule_corefs_from_figures`, updated in place.
+        NOTE(review): the annotation says dict, but this value is iterated below as a
+        sequence of per-figure dicts - confirm and adjust the annotation.
+    Raises:
+        RuntimeError: If the CHEMEAGLE_API_KEY environment variable is not set.
+        ValueError: If GPT requests a tool that is not in the tool map.
+    """
+    # API key comes from the environment; the Azure endpoint is fixed.
+    api_key = os.getenv("CHEMEAGLE_API_KEY")
+    if not api_key:
+        raise RuntimeError("Missing CHEMEAGLE_API_KEY environment variable")
+    azure_endpoint = "https://hkust.azure-api.net"
+    model = ChemIEToolkit(device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
+    client = AzureOpenAI(
+        api_key=api_key,
+        api_version='2024-06-01',
+        azure_endpoint=azure_endpoint
+    )
+    # Load the image and encode it as Base64 for the data URL sent to GPT.
+    with open(image_path, "rb") as image_file:
+        base64_image = base64.b64encode(image_file.read()).decode('utf-8')
+    # Tool schema advertised to GPT.
+    tools = [
+        {
+            'type': 'function',
+            'function': {
+                'name': 'get_multi_molecular_text_to_correct_withatoms',
+                'description': 'Extracts the SMILES string, the symbols set, and the text coref of all molecular images in a table-reaction image and ready to be correct.',
+                'parameters': {
+                    'type': 'object',
+                    'properties': {
+                        'image_path': {
+                            'type': 'string',
+                            'description': 'The path to the reaction image.',
+                        },
+                    },
+                    'required': ['image_path'],
+                    'additionalProperties': False,
+                },
+            },
+        },
+    ]
+    with open('./prompt_getmolecular_correctR.txt', 'r') as prompt_file:
+        prompt = prompt_file.read()
+    # Conversation prefix shared by both completions (built once instead of three times).
+    messages = [
+        {'role': 'system', 'content': 'You are a helpful assistant.'},
+        {
+            'role': 'user',
+            'content': [
+                {'type': 'text', 'text': prompt},
+                {'type': 'image_url', 'image_url': {'url': f'data:image/png;base64,{base64_image}'}}
+            ]
+        }
+    ]
+    # First completion: GPT decides which tool(s) to call.
+    response = client.chat.completions.create(
+        model='gpt-4o',
+        temperature=0,
+        response_format={'type': 'json_object'},
+        messages=messages,
+        tools=tools
+    )
+    TOOL_MAP = {
+        'get_multi_molecular_text_to_correct_withatoms': get_multi_molecular_text_to_correct_withatoms,
+    }
+    assistant_message = response.choices[0].message
+    results = []
+    # tool_calls is None when GPT answers without requesting a tool; treat that as "no calls"
+    # instead of failing with a TypeError.
+    for tool_call in assistant_message.tool_calls or []:
+        tool_name = tool_call.function.name
+        if tool_name not in TOOL_MAP:
+            raise ValueError(f"Unknown tool called: {tool_name}")
+        # The tool always runs on the local image_path; arguments proposed by GPT are ignored.
+        tool_result = TOOL_MAP[tool_name](image_path)
+        results.append({
+            'role': 'tool',
+            'content': json.dumps({
+                'image_path': image_path,
+                tool_name: tool_result,
+            }),
+            'tool_call_id': tool_call.id,
+        })
+    # Second completion: GPT sees the tool output and produces the corrected JSON.
+    response = client.chat.completions.create(
+        model='gpt-4o',
+        messages=[*messages, assistant_message, *results],
+        response_format={'type': 'json_object'},
+        temperature=0
+    )
+    # Wrapped in a list so it can be zipped against the per-figure coref results.
+    gpt_output = [json.loads(response.choices[0].message.content)]
+    # Fresh, unmodified coref extraction that will receive the corrected symbols.
+    image = Image.open(image_path).convert('RGB')
+    coref_results = model.extract_molecule_corefs_from_figures([image])
+    def update_symbols_in_atoms(input1, input2):
+        """
+        Copy 'symbols' from each box of input1 onto the box at the same position in input2,
+        and mirror them into that box's 'atoms' entries as 'atom_symbol'.
+        Items whose box counts differ, and boxes whose symbol/atom counts differ, are
+        reported with a warning; in the latter case 'symbols' is still replaced.
+        """
+        for item1, item2 in zip(input1, input2):
+            bboxes1 = item1.get('bboxes', [])
+            bboxes2 = item2.get('bboxes', [])
+            if len(bboxes1) != len(bboxes2):
+                print("Warning: Mismatched number of bboxes!")
+                continue
+            for bbox1, bbox2 in zip(bboxes1, bboxes2):
+                if 'symbols' not in bbox1:
+                    continue
+                symbols = bbox1['symbols']
+                bbox2['symbols'] = symbols
+                if 'atoms' not in bbox2:
+                    continue
+                atoms = bbox2['atoms']
+                if len(symbols) != len(atoms):
+                    print(f"Warning: Mismatched symbols and atoms in bbox {bbox1.get('bbox')}!")
+                    continue
+                for atom, symbol in zip(atoms, symbols):
+                    atom['atom_symbol'] = symbol
+        return input2
+    input2_updated = update_symbols_in_atoms(gpt_output, coref_results)
+    def update_smiles_and_molfile(input_data, conversion_function):
+        """
+        Regenerate 'smiles' and 'molfile' for every box that has 'coords', 'symbols' and 'edges'.
+        Args:
+            input_data: Iterable of dicts, each optionally holding a 'bboxes' list.
+            conversion_function: Callable taking (coords, symbols, edges) and returning
+                a 3-tuple whose first two items are the new smiles and molfile.
+        Returns:
+            input_data, updated in place.
+        """
+        for item in input_data:
+            for bbox in item.get('bboxes', []):
+                if not all(key in bbox for key in ['coords', 'symbols', 'edges']):
+                    continue
+                new_smiles, new_molfile, _ = conversion_function(bbox['coords'], bbox['symbols'], bbox['edges'])
+                print(f"    Generated 'smiles': {new_smiles}")
+                bbox['smiles'] = new_smiles
+                bbox['molfile'] = new_molfile
+        return input_data
+    updated_data = update_smiles_and_molfile(input2_updated, _convert_graph_to_smiles)
+    print(f"updated_mol_data:{updated_data}")
+    return updated_data

get_reaction_agent.py ADDED Viewed

	@@ -0,0 +1,507 @@

+import sys
+import torch
+import json
+from chemietoolkit import ChemIEToolkit
+import cv2
+from PIL import Image
+import json
+import sys
+#sys.path.append('./RxnScribe-main/')
+import torch
+from rxnscribe import RxnScribe
+import json
+from molscribe.chemistry import _convert_graph_to_smiles
+from openai import AzureOpenAI
+import base64
+import numpy as np
+from chemietoolkit import utils
+from PIL import Image
+# Module-level setup: pick the device once (GPU when available) and load both models onto it.
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ckpt_path = "./pix2seq_reaction_full.ckpt"
+# model1 is the RxnScribe model built from the checkpoint above; model is the ChemIEToolkit.
+model1 = RxnScribe(ckpt_path, device=device)
+model = ChemIEToolkit(device=device)
+def get_reaction(image_path: str) -> dict:
+    '''
+    Summarise the first reaction that model1 predicts for an image.
+    The result has up to three keys - 'reactants', 'conditions', 'products' - each present
+    only when the prediction contains it. Reactants and products are reduced to
+    'smiles', 'bbox' and 'symbols'; conditions keep 'bbox' plus 'smiles' / 'text' when present.
+    The summary is printed before it is returned.
+    '''
+    first_reaction = model1.predict_image_file(image_path, molscribe=True, ocr=True)[0]
+    def describe_molecule(entry):
+        # Missing fields fall back to an empty string / empty list.
+        return {
+            "smiles": entry.get("smiles", ""),
+            "bbox": entry.get("bbox", []),
+            "symbols": entry.get("symbols", [])
+        }
+    def describe_condition(entry):
+        # 'smiles' and 'text' are optional on a condition and copied only when present.
+        described = {"bbox": entry.get("bbox", [])}
+        for optional_key in ("smiles", "text"):
+            if optional_key in entry:
+                described[optional_key] = entry[optional_key]
+        return described
+    describers = (
+        ('reactants', describe_molecule),
+        ('conditions', describe_condition),
+        ('products', describe_molecule),
+    )
+    structured_output = {
+        section: [describe(entry) for entry in first_reaction[section]]
+        for section, describe in describers
+        if section in first_reaction
+    }
+    print(structured_output)
+    return structured_output
+def get_full_reaction(image_path: str) -> dict:
+    '''
+    Run model1 on an image (with molscribe and ocr enabled) and return its prediction unmodified.
+    NOTE(review): annotated as dict, but sibling code indexes this same prediction with [0],
+    so it is presumably a list of reactions - confirm.
+    '''
+    return model1.predict_image_file(image_path, molscribe=True, ocr=True)
+def get_reaction_withatoms(image_path: str) -> list:
+    """
+    Extract the reaction in an image and rebuild its molecules from GPT-4o-corrected atom symbols.
+    GPT-4o (through Azure OpenAI) is offered the `get_reaction` tool for the image; the tool
+    output is fed back for a second completion whose JSON answer supplies corrected 'symbols'
+    for reactants and products. Those symbols are merged, matched by 'bbox', into the first
+    reaction of a fresh `model1.predict_image_file` run, and 'smiles' / 'molfile' are regenerated.
+    Args:
+        image_path (str): Path to the reaction image file.
+    Returns:
+        list: A one-element list holding the updated first predicted reaction.
+    Raises:
+        RuntimeError: If the CHEMEAGLE_API_KEY environment variable is not set.
+        ValueError: If GPT requests a tool that is not in the tool map.
+    """
+    # Imported locally so the function does not rely on a module-level `import os`.
+    import os
+    # The key is read from the environment; a secret must never be committed in source.
+    api_key = os.getenv("CHEMEAGLE_API_KEY")
+    if not api_key:
+        raise RuntimeError("Missing CHEMEAGLE_API_KEY environment variable")
+    azure_endpoint = "https://hkust.azure-api.net"
+    client = AzureOpenAI(
+        api_key=api_key,
+        api_version='2024-06-01',
+        azure_endpoint=azure_endpoint
+    )
+    # Load the image and encode it as Base64 for the data URL sent to GPT.
+    with open(image_path, "rb") as image_file:
+        base64_image = base64.b64encode(image_file.read()).decode('utf-8')
+    # Tool schema advertised to GPT.
+    tools = [
+        {
+            'type': 'function',
+            'function': {
+                'name': 'get_reaction',
+                'description': 'Get a list of reactions from a reaction image. A reaction contains data of the reactants, conditions, and products.',
+                'parameters': {
+                    'type': 'object',
+                    'properties': {
+                        'image_path': {
+                            'type': 'string',
+                            'description': 'The path to the reaction image.',
+                        },
+                    },
+                    'required': ['image_path'],
+                    'additionalProperties': False,
+                },
+            },
+        },
+    ]
+    with open('./prompt_getreaction.txt', 'r') as prompt_file:
+        prompt = prompt_file.read()
+    # Conversation prefix shared by both completions.
+    messages = [
+        {'role': 'system', 'content': 'You are a helpful assistant.'},
+        {
+            'role': 'user',
+            'content': [
+                {'type': 'text', 'text': prompt},
+                {'type': 'image_url', 'image_url': {'url': f'data:image/png;base64,{base64_image}'}}
+            ]
+        }
+    ]
+    # First completion: GPT decides which tool(s) to call.
+    response = client.chat.completions.create(
+        model='gpt-4o',
+        temperature=0,
+        response_format={'type': 'json_object'},
+        messages=messages,
+        tools=tools
+    )
+    TOOL_MAP = {
+        'get_reaction': get_reaction,
+    }
+    assistant_message = response.choices[0].message
+    results = []
+    # tool_calls is None when GPT answers without requesting a tool; treat that as "no calls"
+    # instead of failing with a TypeError.
+    for tool_call in assistant_message.tool_calls or []:
+        tool_name = tool_call.function.name
+        if tool_name not in TOOL_MAP:
+            raise ValueError(f"Unknown tool called: {tool_name}")
+        # The tool always runs on the local image_path; arguments proposed by GPT are ignored.
+        tool_result = TOOL_MAP[tool_name](image_path)
+        results.append({
+            'role': 'tool',
+            'content': json.dumps({
+                'image_path': image_path,
+                tool_name: tool_result,
+            }),
+            'tool_call_id': tool_call.id,
+        })
+    # Second completion: GPT sees the tool output and produces the corrected JSON.
+    response = client.chat.completions.create(
+        model='gpt-4o',
+        messages=[*messages, assistant_message, *results],
+        response_format={'type': 'json_object'},
+        temperature=0
+    )
+    gpt_output = json.loads(response.choices[0].message.content)
+    print(f"gpt_output1:{gpt_output}")
+    # Fresh, unmodified prediction that will receive the corrected symbols.
+    input2 = model1.predict_image_file(image_path, molscribe=True, ocr=True)
+    def update_input_with_symbols(input1, input2, conversion_function):
+        """
+        Merge the 'symbols' of input1's reactants/products into the input2 entries that have
+        exactly the same 'bbox', then regenerate 'smiles' and 'molfile' for those entries.
+        Returns input2, updated in place.
+        """
+        # bbox (as a tuple) -> corrected symbols, gathered from the GPT answer.
+        symbol_mapping = {}
+        for key in ['reactants', 'products']:
+            for item in input1.get(key, []):
+                symbol_mapping[tuple(item['bbox'])] = item['symbols']
+        for key in ['reactants', 'products']:
+            for item in input2.get(key, []):
+                bbox = tuple(item['bbox'])
+                if bbox not in symbol_mapping:
+                    continue
+                updated_symbols = symbol_mapping[bbox]
+                item['symbols'] = updated_symbols
+                # Mirror the symbols into the per-atom records only when the counts agree.
+                if 'atoms' in item:
+                    atoms = item['atoms']
+                    if len(atoms) != len(updated_symbols):
+                        print(f"Warning: Mismatched symbols and atoms in bbox {bbox}")
+                    else:
+                        for atom, symbol in zip(atoms, updated_symbols):
+                            atom['atom_symbol'] = symbol
+                if 'coords' in item and 'edges' in item:
+                    new_smiles, new_molfile, _ = conversion_function(item['coords'], updated_symbols, item['edges'])
+                    item['smiles'] = new_smiles
+                    item['molfile'] = new_molfile
+        return input2
+    updated_data = [update_input_with_symbols(gpt_output, input2[0], _convert_graph_to_smiles)]
+    return updated_data
+def get_reaction_withatoms_correctR(image_path: str) -> list:
+    """
+    Extract the reaction in an image and rebuild its molecules from GPT-4o-corrected atom symbols,
+    using the prompt stored in './prompt_getreaction_correctR.txt'.
+    GPT-4o (through Azure OpenAI) is offered the `get_reaction` tool for the image; the tool
+    output is fed back for a second completion whose JSON answer supplies corrected 'symbols'
+    for reactants and products. Those symbols are merged, matched by 'bbox', into the first
+    reaction of a fresh `model1.predict_image_file` run, and 'smiles' / 'molfile' are regenerated.
+    Args:
+        image_path (str): Path to the reaction image file.
+    Returns:
+        list: A one-element list holding the updated first predicted reaction.
+    Raises:
+        RuntimeError: If the CHEMEAGLE_API_KEY environment variable is not set.
+        ValueError: If GPT requests a tool that is not in the tool map.
+    """
+    # Imported locally so the function does not rely on a module-level `import os`.
+    import os
+    # The key is read from the environment; a secret must never be committed in source.
+    api_key = os.getenv("CHEMEAGLE_API_KEY")
+    if not api_key:
+        raise RuntimeError("Missing CHEMEAGLE_API_KEY environment variable")
+    azure_endpoint = "https://hkust.azure-api.net"
+    client = AzureOpenAI(
+        api_key=api_key,
+        api_version='2024-06-01',
+        azure_endpoint=azure_endpoint
+    )
+    # Load the image and encode it as Base64 for the data URL sent to GPT.
+    with open(image_path, "rb") as image_file:
+        base64_image = base64.b64encode(image_file.read()).decode('utf-8')
+    # Tool schema advertised to GPT.
+    tools = [
+        {
+            'type': 'function',
+            'function': {
+                'name': 'get_reaction',
+                'description': 'Get a list of reactions from a reaction image. A reaction contains data of the reactants, conditions, and products.',
+                'parameters': {
+                    'type': 'object',
+                    'properties': {
+                        'image_path': {
+                            'type': 'string',
+                            'description': 'The path to the reaction image.',
+                        },
+                    },
+                    'required': ['image_path'],
+                    'additionalProperties': False,
+                },
+            },
+        },
+    ]
+    with open('./prompt_getreaction_correctR.txt', 'r') as prompt_file:
+        prompt = prompt_file.read()
+    # Conversation prefix shared by both completions.
+    messages = [
+        {'role': 'system', 'content': 'You are a helpful assistant.'},
+        {
+            'role': 'user',
+            'content': [
+                {'type': 'text', 'text': prompt},
+                {'type': 'image_url', 'image_url': {'url': f'data:image/png;base64,{base64_image}'}}
+            ]
+        }
+    ]
+    # First completion: GPT decides which tool(s) to call.
+    response = client.chat.completions.create(
+        model='gpt-4o',
+        temperature=0,
+        response_format={'type': 'json_object'},
+        messages=messages,
+        tools=tools
+    )
+    TOOL_MAP = {
+        'get_reaction': get_reaction,
+    }
+    assistant_message = response.choices[0].message
+    results = []
+    # tool_calls is None when GPT answers without requesting a tool; treat that as "no calls"
+    # instead of failing with a TypeError.
+    for tool_call in assistant_message.tool_calls or []:
+        tool_name = tool_call.function.name
+        if tool_name not in TOOL_MAP:
+            raise ValueError(f"Unknown tool called: {tool_name}")
+        # The tool always runs on the local image_path; arguments proposed by GPT are ignored.
+        tool_result = TOOL_MAP[tool_name](image_path)
+        results.append({
+            'role': 'tool',
+            'content': json.dumps({
+                'image_path': image_path,
+                tool_name: tool_result,
+            }),
+            'tool_call_id': tool_call.id,
+        })
+    # Second completion: GPT sees the tool output and produces the corrected JSON.
+    response = client.chat.completions.create(
+        model='gpt-4o',
+        messages=[*messages, assistant_message, *results],
+        response_format={'type': 'json_object'},
+        temperature=0
+    )
+    gpt_output = json.loads(response.choices[0].message.content)
+    print(f"gpt_output1:{gpt_output}")
+    # Fresh, unmodified prediction that will receive the corrected symbols.
+    input2 = model1.predict_image_file(image_path, molscribe=True, ocr=True)
+    def update_input_with_symbols(input1, input2, conversion_function):
+        """
+        Merge the 'symbols' of input1's reactants/products into the input2 entries that have
+        exactly the same 'bbox', then regenerate 'smiles' and 'molfile' for those entries.
+        Returns input2, updated in place.
+        """
+        # bbox (as a tuple) -> corrected symbols, gathered from the GPT answer.
+        symbol_mapping = {}
+        for key in ['reactants', 'products']:
+            for item in input1.get(key, []):
+                symbol_mapping[tuple(item['bbox'])] = item['symbols']
+        for key in ['reactants', 'products']:
+            for item in input2.get(key, []):
+                bbox = tuple(item['bbox'])
+                if bbox not in symbol_mapping:
+                    continue
+                updated_symbols = symbol_mapping[bbox]
+                item['symbols'] = updated_symbols
+                # Mirror the symbols into the per-atom records only when the counts agree.
+                if 'atoms' in item:
+                    atoms = item['atoms']
+                    if len(atoms) != len(updated_symbols):
+                        print(f"Warning: Mismatched symbols and atoms in bbox {bbox}")
+                    else:
+                        for atom, symbol in zip(atoms, updated_symbols):
+                            atom['atom_symbol'] = symbol
+                if 'coords' in item and 'edges' in item:
+                    new_smiles, new_molfile, _ = conversion_function(item['coords'], updated_symbols, item['edges'])
+                    item['smiles'] = new_smiles
+                    item['molfile'] = new_molfile
+        return input2
+    updated_data = [update_input_with_symbols(gpt_output, input2[0], _convert_graph_to_smiles)]
+    print(f"updated_reaction_data:{updated_data}")
+    return updated_data

main.py ADDED Viewed

	@@ -0,0 +1,546 @@

+import sys
+import torch
+import json
+from chemietoolkit import ChemIEToolkit,utils
+import cv2
+from openai import AzureOpenAI
+import numpy as np
+from PIL import Image
+import json
+from get_molecular_agent import process_reaction_image_with_multiple_products_and_text_correctR
+from get_reaction_agent import get_reaction_withatoms_correctR
+import sys
+from rxnscribe import RxnScribe
+import json
+import base64
+# Module-level setup: pick the device once (GPU when available) and load both models onto it.
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+model = ChemIEToolkit(device=device)
+ckpt_path = "./pix2seq_reaction_full.ckpt"
+model1 = RxnScribe(ckpt_path, device=device)
+import os
+with open('api_key.txt', 'r') as api_key_file:
+    # Strip surrounding whitespace so the trailing newline that text files usually end with
+    # does not become part of the key.
+    API_KEY = api_key_file.read().strip()
+def parse_coref_data_with_fallback(data):
+    """
+    Flatten one coref result into a list of {"smiles", "texts"} records.
+    Each index pair in data["corefs"] yields one record: the SMILES comes from whichever box
+    of the pair has a "smiles" key (first box preferred) and the texts from whichever has a
+    "text" key (second box preferred). Boxes that carry a "smiles" but appear in no pair are
+    appended afterwards with a fixed placeholder text.
+    """
+    boxes = data["bboxes"]
+    seen = set()
+    records = []
+    # Pass 1: boxes linked by a coref pair.
+    for first, second in data["corefs"]:
+        box_a = boxes[first]
+        box_b = boxes[second]
+        mol_box = box_a if "smiles" in box_a else box_b
+        label_box = box_b if "text" in box_b else box_a
+        records.append({
+            "smiles": mol_box.get("smiles", ""),
+            "texts": label_box.get("text", [])
+        })
+        seen.update((first, second))
+    # Pass 2: molecules that were never paired with a label.
+    records.extend(
+        {
+            "smiles": box["smiles"],
+            "texts": ["There is no label or failed to detect, please recheck the image again"]
+        }
+        for position, box in enumerate(boxes)
+        if "smiles" in box and position not in seen
+    )
+    return records
+def get_multi_molecular_text_to_correct(image_path: str) -> str:
+    '''
+    Detect molecules and their text labels in an image and return them as a JSON string.
+    The image is passed to `model.extract_molecule_corefs_from_figures`; bulky per-box fields
+    are removed, and the first figure's result is flattened by `parse_coref_data_with_fallback`
+    into a list of {"smiles", "texts"} records. The JSON text is printed and returned.
+    Args:
+        image_path (str): Path to the image file.
+    Returns:
+        str: JSON encoding of the list of {"smiles", "texts"} records.
+    '''
+    # Close the file handle as soon as the RGB copy has been made.
+    with Image.open(image_path) as source_image:
+        image = source_image.convert('RGB')
+    coref_results = model.extract_molecule_corefs_from_figures([image])
+    # Per-box fields dropped before pairing; "smiles" and "text" are not in this list.
+    dropped_keys = ("category", "bbox", "molfile", "symbols", 'atoms', "bonds", 'category_id', 'score', 'corefs', "coords", "edges")
+    for item in coref_results:
+        for bbox in item.get("bboxes", []):
+            for key in dropped_keys:
+                bbox.pop(key, None)  # pop with a default: absent keys are simply skipped
+    # Only the first figure's result is used. Serialise once and reuse for print and return.
+    parsed_json = json.dumps(parse_coref_data_with_fallback(coref_results[0]))
+    print(f"coref_results:{parsed_json}")
+    return parsed_json
+def get_reaction(image_path: str) -> dict:
+    '''
+    Summarise the first reaction returned by `get_reaction_withatoms_correctR` for an image.
+    The result has up to three keys - 'reactants', 'conditions', 'products' - each present
+    only when the reaction contains it. Reactants and products are reduced to 'smiles' and
+    'bbox'; conditions are reduced to 'text', 'bbox' and 'smiles'.
+    '''
+    first_reaction = get_reaction_withatoms_correctR(image_path)[0]
+    def summarize_molecule(entry):
+        return {
+            "smiles": entry.get("smiles", ""),
+            "bbox": entry.get("bbox", [])
+        }
+    def summarize_condition(entry):
+        # Every field defaults to an empty list when the condition lacks it.
+        return {
+            "text": entry.get("text", []),
+            "bbox": entry.get("bbox", []),
+            "smiles": entry.get("smiles", []),
+        }
+    summarizers = (
+        ('reactants', summarize_molecule),
+        ('conditions', summarize_condition),
+        ('products', summarize_molecule),
+    )
+    return {
+        section: [summarize(entry) for entry in first_reaction[section]]
+        for section, summarize in summarizers
+        if section in first_reaction
+    }
+def process_reaction_image(image_path: str) -> dict:
+    """
+    Args:
+        image_path (str): 图像文件路径。
+    Returns:
+        dict: 整理后的反应数据，包括反应物、产物和反应模板。
+    """
+    # 配置 API Key 和 Azure Endpoint
+    api_key = os.getenv("CHEMEAGLE_API_KEY")
+    if not api_key:
+        raise RuntimeError("Missing CHEMEAGLE_API_KEY environment variable")
+    azure_endpoint = "https://hkust.azure-api.net"  # 替换为实际的 Azure Endpoint
+    model = ChemIEToolkit(device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
+    client = AzureOpenAI(
+        api_key=api_key,
+        api_version='2024-06-01',
+        azure_endpoint=azure_endpoint
+    )
+    # 加载图像并编码为 Base64
+    def encode_image(image_path: str):
+        with open(image_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode('utf-8')
+    base64_image = encode_image(image_path)
+    # GPT 工具调用配置
+    tools = [
+        {
+            'type': 'function',
+            'function': {
+                'name': 'get_multi_molecular_text_to_correct',
+                'description': 'Extracts the SMILES string and text coref from molecular images.',
+                'parameters': {
+                    'type': 'object',
+                    'properties': {
+                        'image_path': {
+                            'type': 'string',
+                            'description': 'Path to the reaction image.'
+                        }
+                    },
+                    'required': ['image_path'],
+                    'additionalProperties': False
+                }
+            }
+        },
+        {
+        'type': 'function',
+        'function': {
+            'name': 'get_reaction',
+            'description': 'Get a list of reactions from a reaction image. A reaction contains data of the reactants, conditions, and products.',
+            'parameters': {
+                'type': 'object',
+                'properties': {
+                    'image_path': {
+                        'type': 'string',
+                        'description': 'The path to the reaction image.',
+                    },
+                },
+                'required': ['image_path'],
+                'additionalProperties': False,
+            },
+        },
+            },
+    ]
+    # 提供给 GPT 的消息内容
+    with open('./prompt.txt', 'r') as prompt_file:
+        prompt = prompt_file.read()
+    messages = [
+        {'role': 'system', 'content': 'You are a helpful assistant.'},
+        {
+            'role': 'user',
+            'content': [
+                {'type': 'text', 'text': prompt},
+                {'type': 'image_url', 'image_url': {'url': f'data:image/png;base64,{base64_image}'}}
+            ]
+        }
+    ]
+    # 调用 GPT 接口
+    response = client.chat.completions.create(
+    model = 'gpt-4o',
+    temperature = 0,
+    response_format={ 'type': 'json_object' },
+    messages = [
+        {'role': 'system', 'content': 'You are a helpful assistant.'},
+        {
+            'role': 'user',
+            'content': [
+                {
+                    'type': 'text',
+                    'text': prompt
+                },
+                {
+                    'type': 'image_url',
+                    'image_url': {
+                        'url': f'data:image/png;base64,{base64_image}'
+                    }
+                }
+            ]},
+    ],
+    tools = tools)
+# Step 1: 工具映射表
+    TOOL_MAP = {
+        'get_multi_molecular_text_to_correct': get_multi_molecular_text_to_correct,
+        'get_reaction': get_reaction
+    }
+    # Step 2: 处理多个工具调用
+    tool_calls = response.choices[0].message.tool_calls
+    results = []
+    # 遍历每个工具调用
+    for tool_call in tool_calls:
+        tool_name = tool_call.function.name
+        tool_arguments = tool_call.function.arguments
+        tool_call_id = tool_call.id
+        tool_args = json.loads(tool_arguments)
+        if tool_name in TOOL_MAP:
+            # 调用工具并获取结果
+            tool_result = TOOL_MAP[tool_name](image_path)
+        else:
+            raise ValueError(f"Unknown tool called: {tool_name}")
+        # 保存每个工具调用结果
+        results.append({
+            'role': 'tool',
+            'content': json.dumps({
+                'image_path': image_path,
+                f'{tool_name}':(tool_result),
+            }),
+            'tool_call_id': tool_call_id,
+        })
+# Prepare the chat completion payload
+    completion_payload = {
+        'model': 'gpt-4o',
+        'messages': [
+            {'role': 'system', 'content': 'You are a helpful assistant.'},
+            {
+                'role': 'user',
+                'content': [
+                    {
+                        'type': 'text',
+                        'text': prompt
+                    },
+                    {
+                        'type': 'image_url',
+                        'image_url': {
+                            'url': f'data:image/png;base64,{base64_image}'
+                        }
+                    }
+                ]
+            },
+            response.choices[0].message,
+            *results
+            ],
+    }
+# Generate new response
+    response = client.chat.completions.create(
+        model=completion_payload["model"],
+        messages=completion_payload["messages"],
+        response_format={ 'type': 'json_object' },
+        temperature=0
+    )
+    # 获取 GPT 生成的结果
+    gpt_output = json.loads(response.choices[0].message.content)
+    print(gpt_output)
+    image = Image.open(image_path).convert('RGB')
+    image_np = np.array(image)
+    # reaction_results = model.extract_reactions_from_figures([image_np])
+    coref_results = model.extract_molecule_corefs_from_figures([image_np])
+    reaction_results = get_reaction_withatoms_correctR(image_path)[0]
+    reaction = {
+    "reactants": reaction_results.get('reactants', []),
+    "conditions": reaction_results.get('conditions', []),
+    "products": reaction_results.get('products', [])
+    }
+    reaction_results = [{"reactions": [reaction]}]
+    print(reaction_results)
+    #coref_results = process_reaction_image_with_multiple_products_and_text_correctR(image_path)
+    # 定义更新工具输出的函数
+    def extract_smiles_details(smiles_data, raw_details):
+        smiles_details = {}
+        for smiles in smiles_data:
+            for detail in raw_details:
+                for bbox in detail.get('bboxes', []):
+                    if bbox.get('smiles') == smiles:
+                        smiles_details[smiles] = {
+                            'category': bbox.get('category'),
+                            'bbox': bbox.get('bbox'),
+                            'category_id': bbox.get('category_id'),
+                            'score': bbox.get('score'),
+                            'molfile': bbox.get('molfile'),
+                            'atoms': bbox.get('atoms'),
+                            'bonds': bbox.get('bonds'),
+                        }
+                        break
+        return smiles_details
+# 获取结果
+    smiles_details = extract_smiles_details(gpt_output, coref_results)
+    reactants_array = []
+    products = []
+    for reactant in reaction_results[0]['reactions'][0]['reactants']:
+        if 'smiles' in reactant:
+            print(f"SMILES:{reactant['smiles']}")
+            #print(reactant)
+            reactants_array.append(reactant['smiles'])
+    for product in reaction_results[0]['reactions'][0]['products']:
+        #print(product['smiles'])
+        #print(product)
+        products.append(product['smiles'])
+    # 输出结果
+    #import pprint
+    #pprint.pprint(smiles_details)
+        # 整理反应数据
+    backed_out = utils.backout_without_coref(reaction_results, coref_results, gpt_output, smiles_details, model.molscribe)
+    backed_out.sort(key=lambda x: x[2])
+    extracted_rxns = {}
+    for reactants, products_, label in backed_out:
+        extracted_rxns[label] = {'reactants': reactants, 'products': products_}
+    toadd = {
+        "reaction_template": {
+            "reactants": reactants_array,
+            "products": products
+        },
+        "reactions": extracted_rxns,
+        "original_molecule_list": gpt_output
+    }
+# 按标签排序
+    sorted_keys = sorted(toadd["reactions"].keys())
+    toadd["reactions"] = {i: toadd["reactions"][i] for i in sorted_keys}
+    print(toadd)
+    return toadd
+def ChemEagle(image_path: str) -> dict:
+    """
+    输入化学反应图像路径，通过 GPT 模型和 TOOLS 提取反应信息并返回整理后的反应数据。
+    Args:
+        image_path (str): 图像文件路径。
+    Returns:
+        dict: 整理后的反应数据，包括反应物、产物和反应模板。
+    """
+    # 配置 API Key 和 Azure Endpoint
+    api_key = os.getenv("CHEMEAGLE_API_KEY")
+    if not api_key:
+        raise RuntimeError("Missing CHEMEAGLE_API_KEY environment variable")
+    azure_endpoint = "https://hkust.azure-api.net"  # 替换为实际的 Azure Endpoint
+    model = ChemIEToolkit(device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
+    client = AzureOpenAI(
+        api_key=api_key,
+        api_version='2024-06-01',
+        azure_endpoint=azure_endpoint
+    )
+    # 加载图像并编码为 Base64
+    def encode_image(image_path: str):
+        with open(image_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode('utf-8')
+    base64_image = encode_image(image_path)
+    # GPT 工具调用配置
+    tools = [
+        {
+        'type': 'function',
+        'function': {
+            'name': 'process_reaction_image',
+            'description': 'get the reaction data of the reaction diagram and get SMILES strings of every detailed reaction in reaction diagram and the table, and the original molecular list.',
+            'parameters': {
+                'type': 'object',
+                'properties': {
+                    'image_path': {
+                        'type': 'string',
+                        'description': 'The path to the reaction image.',
+                    },
+                },
+                'required': ['image_path'],
+                'additionalProperties': False,
+            },
+        },
+            },
+    ]
+    # 提供给 GPT 的消息内容
+    with open('./prompt_final_simple_version.txt', 'r') as prompt_file:
+        prompt = prompt_file.read()
+    messages = [
+        {'role': 'system', 'content': 'You are a helpful assistant.'},
+        {
+            'role': 'user',
+            'content': [
+                {'type': 'text', 'text': prompt},
+                {'type': 'image_url', 'image_url': {'url': f'data:image/png;base64,{base64_image}'}}
+            ]
+        }
+    ]
+    # 调用 GPT 接口
+    response = client.chat.completions.create(
+    model = 'gpt-4o',
+    temperature = 0,
+    response_format={ 'type': 'json_object' },
+    messages = [
+        {'role': 'system', 'content': 'You are a helpful assistant.'},
+        {
+            'role': 'user',
+            'content': [
+                {
+                    'type': 'text',
+                    'text': prompt
+                },
+                {
+                    'type': 'image_url',
+                    'image_url': {
+                        'url': f'data:image/png;base64,{base64_image}'
+                    }
+                }
+            ]},
+    ],
+    tools = tools)
+# Step 1: 工具映射表
+    TOOL_MAP = {
+        'process_reaction_image': process_reaction_image
+    }
+    # Step 2: 处理多个工具调用
+    tool_calls = response.choices[0].message.tool_calls
+    results = []
+    # 遍历每个工具调用
+    for tool_call in tool_calls:
+        tool_name = tool_call.function.name
+        tool_arguments = tool_call.function.arguments
+        tool_call_id = tool_call.id
+        tool_args = json.loads(tool_arguments)
+        if tool_name in TOOL_MAP:
+            # 调用工具并获取结果
+            tool_result = TOOL_MAP[tool_name](image_path)
+        else:
+            raise ValueError(f"Unknown tool called: {tool_name}")
+        # 保存每个工具调用结果
+        results.append({
+            'role': 'tool',
+            'content': json.dumps({
+                'image_path': image_path,
+                f'{tool_name}':(tool_result),
+            }),
+            'tool_call_id': tool_call_id,
+        })
+# Prepare the chat completion payload
+    completion_payload = {
+        'model': 'gpt-4o',
+        'messages': [
+            {'role': 'system', 'content': 'You are a helpful assistant.'},
+            {
+                'role': 'user',
+                'content': [
+                    {
+                        'type': 'text',
+                        'text': prompt
+                    },
+                    {
+                        'type': 'image_url',
+                        'image_url': {
+                            'url': f'data:image/png;base64,{base64_image}'
+                        }
+                    }
+                ]
+            },
+            response.choices[0].message,
+            *results
+            ],
+    }
+# Generate new response
+    response = client.chat.completions.create(
+        model=completion_payload["model"],
+        messages=completion_payload["messages"],
+        response_format={ 'type': 'json_object' },
+        temperature=0
+    )
+    # 获取 GPT 生成的结果
+    gpt_output = json.loads(response.choices[0].message.content)
+    print(gpt_output)
+    return gpt_output

main_Rgroup_debug.ipynb ADDED Viewed

	@@ -0,0 +1,993 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "import torch\n",
+    "import json\n",
+    "from chemietoolkit import ChemIEToolkit\n",
+    "import cv2\n",
+    "from PIL import Image\n",
+    "import json\n",
+    "model = ChemIEToolkit(device=torch.device('cpu')) \n",
+    "from get_molecular_agent import process_reaction_image_with_multiple_products_and_text\n",
+    "from get_reaction_agent import get_reaction_withatoms\n",
+    "from get_reaction_agent import get_full_reaction\n",
+    "\n",
+    "\n",
+    "# 定义函数，接受多个图像路径并返回反应列表\n",
+    "def get_multi_molecular(image_path: str) -> list:\n",
+    "    '''Returns a list of reactions extracted from the image.'''\n",
+    "    # 打开图像文件\n",
+    "    image = Image.open(image_path).convert('RGB')\n",
+    "    \n",
+    "    # 将图像作为输入传递给模型\n",
+    "    coref_results = model.extract_molecule_corefs_from_figures([image])\n",
+    "    \n",
+    "    for item in coref_results:\n",
+    "        for bbox in item.get(\"bboxes\", []):\n",
+    "            for key in [\"category\", \"molfile\", \"symbols\", 'atoms', \"bonds\", 'category_id', 'score', 'corefs',\"coords\",\"edges\"]: #'atoms'\n",
+    "                bbox.pop(key, None)  # 安全地移除键\n",
+    "    print(json.dumps(coref_results))\n",
+    "    # 返回反应列表，使用 json.dumps 进行格式化\n",
+    "    \n",
+    "    return json.dumps(coref_results)\n",
+    "\n",
+    "def get_multi_molecular_text_to_correct(image_path: str) -> list:\n",
+    "    '''Returns a list of reactions extracted from the image.'''\n",
+    "    # 打开图像文件\n",
+    "    image = Image.open(image_path).convert('RGB')\n",
+    "    \n",
+    "    # 将图像作为输入传递给模型\n",
+    "    coref_results = model.extract_molecule_corefs_from_figures([image])\n",
+    "    #coref_results = process_reaction_image_with_multiple_products_and_text(image_path)\n",
+    "    for item in coref_results:\n",
+    "        for bbox in item.get(\"bboxes\", []):\n",
+    "            for key in [\"category\", \"bbox\", \"molfile\", \"symbols\", 'atoms', \"bonds\", 'category_id', 'score', 'corefs',\"coords\",\"edges\"]: #'atoms'\n",
+    "                bbox.pop(key, None)  # 安全地移除键\n",
+    "    print(json.dumps(coref_results))\n",
+    "    # 返回反应列表，使用 json.dumps 进行格式化\n",
+    "    \n",
+    "    return json.dumps(coref_results)\n",
+    "\n",
+    "def get_multi_molecular_text_to_correct_withatoms(image_path: str) -> list:\n",
+    "    '''Returns a list of reactions extracted from the image.'''\n",
+    "    # 打开图像文件\n",
+    "    image = Image.open(image_path).convert('RGB')\n",
+    "    \n",
+    "    # 将图像作为输入传递给模型\n",
+    "    #coref_results = model.extract_molecule_corefs_from_figures([image])\n",
+    "    coref_results = process_reaction_image_with_multiple_products_and_text(image_path)\n",
+    "    for item in coref_results:\n",
+    "        for bbox in item.get(\"bboxes\", []):\n",
+    "            for key in [\"molfile\", 'atoms', \"bonds\", 'category_id', 'score', 'corefs',\"coords\",\"edges\"]: #'atoms'\n",
+    "                bbox.pop(key, None)  # 安全地移除键\n",
+    "    print(json.dumps(coref_results))\n",
+    "    # 返回反应列表，使用 json.dumps 进行格式化\n",
+    "    return json.dumps(coref_results)\n",
+    "\n",
+    "#get_multi_molecular_text_to_correct('./acs.joc.2c00176 example 1.png')\n",
+    "\n",
+    "import sys\n",
+    "#sys.path.append('./RxnScribe-main/')\n",
+    "import torch\n",
+    "from rxnscribe import RxnScribe\n",
+    "import json\n",
+    "\n",
+    "ckpt_path = \"./pix2seq_reaction_full.ckpt\"\n",
+    "model1 = RxnScribe(ckpt_path, device=torch.device('cpu'))\n",
+    "device = torch.device('cpu')\n",
+    "\n",
+    "def get_reaction(image_path: str) -> dict:\n",
+    "    '''\n",
+    "    Returns a structured dictionary of reactions extracted from the image,\n",
+    "    including reactants, conditions, and products, with their smiles, text, and bbox.\n",
+    "    '''\n",
+    "    image_file = image_path\n",
+    "    #raw_prediction = model1.predict_image_file(image_file, molscribe=True, ocr=True)\n",
+    "    raw_prediction = get_reaction_withatoms(image_path)\n",
+    "\n",
+    "    # Ensure raw_prediction is treated as a list directly\n",
+    "    structured_output = {}\n",
+    "    for section_key in ['reactants', 'conditions', 'products']:\n",
+    "        if section_key in raw_prediction[0]:\n",
+    "            structured_output[section_key] = []\n",
+    "            for item in raw_prediction[0][section_key]:\n",
+    "                if section_key in ['reactants', 'products']:\n",
+    "                    # Extract smiles and bbox for molecules\n",
+    "                    structured_output[section_key].append({\n",
+    "                        \"smiles\": item.get(\"smiles\", \"\"),\n",
+    "                        \"bbox\": item.get(\"bbox\", [])\n",
+    "                    })\n",
+    "                elif section_key == 'conditions':\n",
+    "                    # Extract smiles, text, and bbox for conditions\n",
+    "                    condition_data = {\"bbox\": item.get(\"bbox\", [])}\n",
+    "                    if \"smiles\" in item:\n",
+    "                        condition_data[\"smiles\"] = item.get(\"smiles\", \"\")\n",
+    "                    if \"text\" in item:\n",
+    "                        condition_data[\"text\"] = item.get(\"text\", [])\n",
+    "                    structured_output[section_key].append(condition_data)\n",
+    "    print(f\"structured_output:{structured_output}\")\n",
+    "\n",
+    "    return structured_output\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "import base64\n",
+    "import torch\n",
+    "import json\n",
+    "from PIL import Image\n",
+    "import numpy as np\n",
+    "from chemietoolkit import ChemIEToolkit, utils\n",
+    "from openai import AzureOpenAI\n",
+    "\n",
+    "def process_reaction_image_with_multiple_products(image_path: str) -> dict:\n",
+    "    \"\"\"\n",
+    "    Args:\n",
+    "        image_path (str): 图像文件路径。\n",
+    "\n",
+    "    Returns:\n",
+    "        dict: 整理后的反应数据，包括反应物、产物和反应模板。\n",
+    "    \"\"\"\n",
+    "    # 配置 API Key 和 Azure Endpoint\n",
+    "    api_key = \"b038da96509b4009be931e035435e022\"  # 替换为实际的 API Key\n",
+    "    azure_endpoint = \"https://hkust.azure-api.net\"  # 替换为实际的 Azure Endpoint\n",
+    "    \n",
+    "\n",
+    "    model = ChemIEToolkit(device=torch.device('cpu'))\n",
+    "    client = AzureOpenAI(\n",
+    "        api_key=api_key,\n",
+    "        api_version='2024-06-01',\n",
+    "        azure_endpoint=azure_endpoint\n",
+    "    )\n",
+    "\n",
+    "    # 加载图像并编码为 Base64\n",
+    "    def encode_image(image_path: str):\n",
+    "        with open(image_path, \"rb\") as image_file:\n",
+    "            return base64.b64encode(image_file.read()).decode('utf-8')\n",
+    "\n",
+    "    base64_image = encode_image(image_path)\n",
+    "\n",
+    "    # GPT 工具调用配置\n",
+    "    tools = [\n",
+    "        {\n",
+    "            'type': 'function',\n",
+    "            'function': {\n",
+    "                'name': 'get_multi_molecular_text_to_correct',\n",
+    "                'description': 'Extracts the SMILES string and text coref from molecular images.',\n",
+    "                'parameters': {\n",
+    "                    'type': 'object',\n",
+    "                    'properties': {\n",
+    "                        'image_path': {\n",
+    "                            'type': 'string',\n",
+    "                            'description': 'Path to the reaction image.'\n",
+    "                        }\n",
+    "                    },\n",
+    "                    'required': ['image_path'],\n",
+    "                    'additionalProperties': False\n",
+    "                }\n",
+    "            }\n",
+    "        },\n",
+    "        {\n",
+    "        'type': 'function',\n",
+    "        'function': {\n",
+    "            'name': 'get_reaction',\n",
+    "            'description': 'Get a list of reactions from a reaction image. A reaction contains data of the reactants, conditions, and products.',\n",
+    "            'parameters': {\n",
+    "                'type': 'object',\n",
+    "                'properties': {\n",
+    "                    'image_path': {\n",
+    "                        'type': 'string',\n",
+    "                        'description': 'The path to the reaction image.',\n",
+    "                    },\n",
+    "                },\n",
+    "                'required': ['image_path'],\n",
+    "                'additionalProperties': False,\n",
+    "            },\n",
+    "        },\n",
+    "            },\n",
+    "    ]\n",
+    "\n",
+    "    # 提供给 GPT 的消息内容\n",
+    "    with open('./prompt.txt', 'r') as prompt_file:\n",
+    "        prompt = prompt_file.read()\n",
+    "    messages = [\n",
+    "        {'role': 'system', 'content': 'You are a helpful assistant.'},\n",
+    "        {\n",
+    "            'role': 'user',\n",
+    "            'content': [\n",
+    "                {'type': 'text', 'text': prompt},\n",
+    "                {'type': 'image_url', 'image_url': {'url': f'data:image/png;base64,{base64_image}'}}\n",
+    "            ]\n",
+    "        }\n",
+    "    ]\n",
+    "\n",
+    "    # 调用 GPT 接口\n",
+    "    response = client.chat.completions.create(\n",
+    "    model = 'gpt-4o',\n",
+    "    temperature = 0,\n",
+    "    response_format={ 'type': 'json_object' },\n",
+    "    messages = [\n",
+    "        {'role': 'system', 'content': 'You are a helpful assistant.'},\n",
+    "        {\n",
+    "            'role': 'user',\n",
+    "            'content': [\n",
+    "                {\n",
+    "                    'type': 'text',\n",
+    "                    'text': prompt\n",
+    "                },\n",
+    "                {\n",
+    "                    'type': 'image_url',\n",
+    "                    'image_url': {\n",
+    "                        'url': f'data:image/png;base64,{base64_image}'\n",
+    "                    }\n",
+    "                }\n",
+    "            ]},\n",
+    "    ],\n",
+    "    tools = tools)\n",
+    "    \n",
+    "# Step 1: 工具映射表\n",
+    "    TOOL_MAP = {\n",
+    "        'get_multi_molecular_text_to_correct': get_multi_molecular_text_to_correct,\n",
+    "        'get_reaction': get_reaction\n",
+    "    }\n",
+    "\n",
+    "    # Step 2: 处理多个工具调用\n",
+    "    tool_calls = response.choices[0].message.tool_calls\n",
+    "    results = []\n",
+    "\n",
+    "    # 遍历每个工具调用\n",
+    "    for tool_call in tool_calls:\n",
+    "        tool_name = tool_call.function.name\n",
+    "        tool_arguments = tool_call.function.arguments\n",
+    "        tool_call_id = tool_call.id\n",
+    "        \n",
+    "        tool_args = json.loads(tool_arguments)\n",
+    "        \n",
+    "        if tool_name in TOOL_MAP:\n",
+    "            # 调用工具并获取结果\n",
+    "            tool_result = TOOL_MAP[tool_name](image_path)\n",
+    "        else:\n",
+    "            raise ValueError(f\"Unknown tool called: {tool_name}\")\n",
+    "        \n",
+    "        # 保存每个工具调用结果\n",
+    "        results.append({\n",
+    "            'role': 'tool',\n",
+    "            'content': json.dumps({\n",
+    "                'image_path': image_path,\n",
+    "                f'{tool_name}':(tool_result),\n",
+    "            }),\n",
+    "            'tool_call_id': tool_call_id,\n",
+    "        })\n",
+    "\n",
+    "\n",
+    "# Prepare the chat completion payload\n",
+    "    completion_payload = {\n",
+    "        'model': 'gpt-4o',\n",
+    "        'messages': [\n",
+    "            {'role': 'system', 'content': 'You are a helpful assistant.'},\n",
+    "            {\n",
+    "                'role': 'user',\n",
+    "                'content': [\n",
+    "                    {\n",
+    "                        'type': 'text',\n",
+    "                        'text': prompt\n",
+    "                    },\n",
+    "                    {\n",
+    "                        'type': 'image_url',\n",
+    "                        'image_url': {\n",
+    "                            'url': f'data:image/png;base64,{base64_image}'\n",
+    "                        }\n",
+    "                    }\n",
+    "                ]\n",
+    "            },\n",
+    "            response.choices[0].message,\n",
+    "            *results\n",
+    "            ],\n",
+    "    }\n",
+    "\n",
+    "# Generate new response\n",
+    "    response = client.chat.completions.create(\n",
+    "        model=completion_payload[\"model\"],\n",
+    "        messages=completion_payload[\"messages\"],\n",
+    "        response_format={ 'type': 'json_object' },\n",
+    "        temperature=0\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "    \n",
+    "    # 获取 GPT 生成的结果\n",
+    "    gpt_output = json.loads(response.choices[0].message.content)\n",
+    "    print(f\"gptout:{gpt_output}\")\n",
+    "\n",
+    "    image = Image.open(image_path).convert('RGB')\n",
+    "    image_np = np.array(image)\n",
+    "\n",
+    "    #########################\n",
+    "    #reaction_results = model.extract_reactions_from_figures([image_np])\n",
+    "    reaction_results = get_reaction_withatoms(image_path)[0]\n",
+    "    reactions = []\n",
+    "    \n",
+    "    # 将 reactants 和 products 转换为 reactions\n",
+    "    for reactants, conditions, products in zip(reaction_results.get('reactants', []), reaction_results.get('conditions', []), reaction_results.get('products', [])):\n",
+    "        reaction = {\n",
+    "            \"reactants\": [reactants],\n",
+    "            \"conditions\": [conditions],\n",
+    "            \"products\": [products]\n",
+    "        }\n",
+    "        reactions.append(reaction)\n",
+    "    reaction_results = [{\"reactions\": reactions}]\n",
+    "    #coref_results = model.extract_molecule_corefs_from_figures([image_np])\n",
+    "    coref_results = process_reaction_image_with_multiple_products_and_text(image_path)\n",
+    "    ########################\n",
+    "\n",
+    "    # 定义更新工具输出的函数\n",
+    "    def extract_smiles_details(smiles_data, raw_details):\n",
+    "        smiles_details = {}\n",
+    "        for smiles in smiles_data:\n",
+    "            for detail in raw_details:\n",
+    "                for bbox in detail.get('bboxes', []):\n",
+    "                    if bbox.get('smiles') == smiles:\n",
+    "                        smiles_details[smiles] = {\n",
+    "                            'category': bbox.get('category'),\n",
+    "                            'bbox': bbox.get('bbox'),\n",
+    "                            'category_id': bbox.get('category_id'),\n",
+    "                            'score': bbox.get('score'),\n",
+    "                            'molfile': bbox.get('molfile'),\n",
+    "                            'atoms': bbox.get('atoms'),\n",
+    "                            'bonds': bbox.get('bonds')\n",
+    "                        }\n",
+    "                        break\n",
+    "        return smiles_details\n",
+    "\n",
+    "# 获取结果\n",
+    "    smiles_details = extract_smiles_details(gpt_output, coref_results)\n",
+    "\n",
+    "    reactants_array = []\n",
+    "    products = []\n",
+    "\n",
+    "    for reactant in reaction_results[0]['reactions'][0]['reactants']:\n",
+    "    #for reactant in reaction_results[0]['reactions'][0]['reactants']:\n",
+    "        if 'smiles' in reactant:\n",
+    "            #print(reactant['smiles'])\n",
+    "            #print(reactant)\n",
+    "            reactants_array.append(reactant['smiles'])\n",
+    "\n",
+    "    for product in reaction_results[0]['reactions'][0]['products']:\n",
+    "        #print(product['smiles'])\n",
+    "        #print(product)\n",
+    "        products.append(product['smiles'])\n",
+    "    # 输出结果\n",
+    "    #import pprint\n",
+    "    #pprint.pprint(smiles_details)\n",
+    "\n",
+    "        # 整理反应数据\n",
+    "    try:\n",
+    "        backed_out = utils.backout_without_coref(reaction_results, coref_results, gpt_output, smiles_details, model.molscribe)\n",
+    "        backed_out.sort(key=lambda x: x[2])\n",
+    "        extracted_rxns = {}\n",
+    "        for reactants, products_, label in backed_out:\n",
+    "            extracted_rxns[label] = {'reactants': reactants, 'products': products_}\n",
+    "\n",
+    "        toadd = {\n",
+    "            \"reaction_template\": {\n",
+    "                \"reactants\": reactants_array,\n",
+    "                \"products\": products\n",
+    "            },\n",
+    "            \"reactions\": extracted_rxns\n",
+    "        }\n",
+    "        \n",
+    "\n",
+    "    # 按标签排序\n",
+    "        sorted_keys = sorted(toadd[\"reactions\"].keys())\n",
+    "        toadd[\"reactions\"] = {i: toadd[\"reactions\"][i] for i in sorted_keys}\n",
+    "        original_molecular_list = {'Original molecular list': gpt_output}\n",
+    "        final_data= toadd.copy()\n",
+    "        final_data.update(original_molecular_list)\n",
+    "    except:\n",
+    "        #pass\n",
+    "        final_data = {'Original molecular list': gpt_output}\n",
+    "\n",
+    "    print(final_data)\n",
+    "    return final_data\n",
+    " \n",
+    "\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # image_path = './example/Replace/99.jpg'\n",
+    "# # result = process_reaction_image(image_path)\n",
+    "# # print(json.dumps(result, indent=4))\n",
+    "# image_path = './example/example1/replace/Nesting/283.jpg'\n",
+    "# image = Image.open(image_path).convert('RGB')\n",
+    "# image_np = np.array(image)\n",
+    "\n",
+    "# # input1 = get_multi_molecular_text_to_correct_withatoms('./example/example1/replace/Nesting/283.jpg')\n",
+    "# # input2 = get_reaction('./example/example1/replace/Nesting/283.jpg')\n",
+    "# # print(input1)\n",
+    "# # print(input2)\n",
+    "# #reaction_results = model.extract_reactions_from_figures([image_np])\n",
+    "# coorf = model.extract_molecule_corefs_from_figures([image_np])\n",
+    "# print(coorf)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import base64\n",
+    "import torch\n",
+    "import json\n",
+    "from PIL import Image\n",
+    "import numpy as np\n",
+    "from openai import AzureOpenAI\n",
+    "\n",
+    "def process_reaction_image_final(image_path: str, api_key=None, azure_endpoint=None) -> dict:\n",
+    "    \"\"\"\n",
+    "    Extract structured data from a reaction image with GPT-4o tool calling.\n",
+    "\n",
+    "    The prompt from ./prompt_final.txt and the base64-encoded image are sent to\n",
+    "    Azure OpenAI together with the tool schemas. Every tool the model requests\n",
+    "    is run locally on image_path, the tool outputs are sent back in a second\n",
+    "    request, and the JSON answer of that second request is returned. If the\n",
+    "    model requests no tool, the JSON content of the first reply is returned.\n",
+    "\n",
+    "    Args:\n",
+    "        image_path (str): Path to the image file.\n",
+    "        api_key (str, optional): Azure OpenAI API key. Falls back to the\n",
+    "            AZURE_OPENAI_API_KEY environment variable.\n",
+    "        azure_endpoint (str, optional): Azure OpenAI endpoint. Falls back to the\n",
+    "            AZURE_OPENAI_ENDPOINT environment variable, then to the endpoint\n",
+    "            this notebook has always used.\n",
+    "\n",
+    "    Returns:\n",
+    "        dict: JSON object produced by the model (per the original note: the\n",
+    "            organized reaction data, including reactants, products and\n",
+    "            reaction templates).\n",
+    "\n",
+    "    Raises:\n",
+    "        RuntimeError: If no API key is passed and AZURE_OPENAI_API_KEY is unset.\n",
+    "        ValueError: If the model requests a tool that is not in TOOL_MAP.\n",
+    "    \"\"\"\n",
+    "    import os  # function-scope import: this cell does not import os itself\n",
+    "\n",
+    "    # Credentials are never hard-coded; they come from the caller or the environment.\n",
+    "    api_key = api_key or os.environ.get('AZURE_OPENAI_API_KEY')\n",
+    "    if not api_key:\n",
+    "        raise RuntimeError('Azure OpenAI API key missing: pass api_key or set AZURE_OPENAI_API_KEY.')\n",
+    "    azure_endpoint = azure_endpoint or os.environ.get('AZURE_OPENAI_ENDPOINT', 'https://hkust.azure-api.net')\n",
+    "\n",
+    "    client = AzureOpenAI(\n",
+    "        api_key=api_key,\n",
+    "        api_version='2024-06-01',\n",
+    "        azure_endpoint=azure_endpoint\n",
+    "    )\n",
+    "\n",
+    "    # Load the image and encode it as Base64\n",
+    "    with open(image_path, \"rb\") as image_file:\n",
+    "        base64_image = base64.b64encode(image_file.read()).decode('utf-8')\n",
+    "\n",
+    "    def image_tool(name, description, path_description='The path to the reaction image.'):\n",
+    "        # Build the schema of a tool whose only argument is `image_path`.\n",
+    "        return {\n",
+    "            'type': 'function',\n",
+    "            'function': {\n",
+    "                'name': name,\n",
+    "                'description': description,\n",
+    "                'parameters': {\n",
+    "                    'type': 'object',\n",
+    "                    'properties': {\n",
+    "                        'image_path': {\n",
+    "                            'type': 'string',\n",
+    "                            'description': path_description,\n",
+    "                        },\n",
+    "                    },\n",
+    "                    'required': ['image_path'],\n",
+    "                    'additionalProperties': False,\n",
+    "                },\n",
+    "            },\n",
+    "        }\n",
+    "\n",
+    "    # Tool schemas offered to GPT (description strings are sent verbatim to the model)\n",
+    "    tools = [\n",
+    "        image_tool(\n",
+    "            'get_multi_molecular_text_to_correct',\n",
+    "            'Extracts the SMILES string and text coref from molecular sub-images from a reaction image and ready for further process.',\n",
+    "            'Path to the reaction image.',\n",
+    "        ),\n",
+    "        image_tool(\n",
+    "            'get_reaction',\n",
+    "            'Get a list of reactions from a reaction image. A reaction contains data of the reactants, conditions, and products.',\n",
+    "        ),\n",
+    "        image_tool(\n",
+    "            'process_reaction_image_with_multiple_products',\n",
+    "            'process the reaction image that contains a multiple products table. Get a list of reactions from the reaction image, Inculding the reaction template and detailed reaction with detailed R-group information.',\n",
+    "        ),\n",
+    "        image_tool(\n",
+    "            'get_full_reaction',\n",
+    "            'Get a list of reactions from a reaction image without any tables. A reaction contains data of the reactants, conditions, and products.',\n",
+    "        ),\n",
+    "        image_tool(\n",
+    "            'get_multi_molecular',\n",
+    "            'Extracts the SMILES string and text coref from a molecular image without any reactions',\n",
+    "        ),\n",
+    "    ]\n",
+    "\n",
+    "    # Messages given to GPT; the same list is reused by both requests below\n",
+    "    with open('./prompt_final.txt', 'r', encoding='utf-8') as prompt_file:\n",
+    "        prompt = prompt_file.read()\n",
+    "    messages = [\n",
+    "        {'role': 'system', 'content': 'You are a helpful assistant.'},\n",
+    "        {\n",
+    "            'role': 'user',\n",
+    "            'content': [\n",
+    "                {'type': 'text', 'text': prompt},\n",
+    "                {'type': 'image_url', 'image_url': {'url': f'data:image/png;base64,{base64_image}'}}\n",
+    "            ]\n",
+    "        }\n",
+    "    ]\n",
+    "\n",
+    "    # First request: let GPT decide which tools to call\n",
+    "    response = client.chat.completions.create(\n",
+    "        model='gpt-4o',\n",
+    "        temperature=0,\n",
+    "        response_format={'type': 'json_object'},\n",
+    "        messages=messages,\n",
+    "        tools=tools,\n",
+    "    )\n",
+    "    assistant_message = response.choices[0].message\n",
+    "\n",
+    "    # Step 1: map tool names to the local functions that implement them\n",
+    "    TOOL_MAP = {\n",
+    "        'get_multi_molecular_text_to_correct': get_multi_molecular_text_to_correct,\n",
+    "        'get_reaction': get_reaction,\n",
+    "        'process_reaction_image_with_multiple_products': process_reaction_image_with_multiple_products,\n",
+    "        'get_full_reaction': get_full_reaction,\n",
+    "        'get_multi_molecular': get_multi_molecular,\n",
+    "    }\n",
+    "\n",
+    "    # Step 2: handle the tool calls; `tool_calls` is None when the model answers directly\n",
+    "    tool_calls = assistant_message.tool_calls or []\n",
+    "    if not tool_calls:\n",
+    "        # Nothing to execute, so the first reply already is the final JSON answer.\n",
+    "        gpt_output = json.loads(assistant_message.content)\n",
+    "        print(gpt_output)\n",
+    "        return gpt_output\n",
+    "\n",
+    "    results = []\n",
+    "    for tool_call in tool_calls:\n",
+    "        tool_name = tool_call.function.name\n",
+    "        if tool_name not in TOOL_MAP:\n",
+    "            raise ValueError(f\"Unknown tool called: {tool_name}\")\n",
+    "\n",
+    "        # The local image_path is passed on purpose; the arguments the model\n",
+    "        # generated for the call are not used.\n",
+    "        tool_result = TOOL_MAP[tool_name](image_path)\n",
+    "\n",
+    "        # Keep one tool message per call, linked through tool_call_id\n",
+    "        results.append({\n",
+    "            'role': 'tool',\n",
+    "            'content': json.dumps({\n",
+    "                'image_path': image_path,\n",
+    "                tool_name: tool_result,\n",
+    "            }),\n",
+    "            'tool_call_id': tool_call.id,\n",
+    "        })\n",
+    "\n",
+    "    # Second request: original conversation + the assistant's tool request + tool outputs\n",
+    "    response = client.chat.completions.create(\n",
+    "        model='gpt-4o',\n",
+    "        messages=[*messages, assistant_message, *results],\n",
+    "        response_format={'type': 'json_object'},\n",
+    "        temperature=0,\n",
+    "    )\n",
+    "\n",
+    "    # Parse the JSON answer generated by GPT\n",
+    "    gpt_output = json.loads(response.choices[0].message.content)\n",
+    "    print(gpt_output)\n",
+    "    return gpt_output\n",
+    "\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image_path = './data/bowen-4/2.png'\n",
+    "result = process_reaction_image_final(image_path)\n",
+    "print(json.dumps(result, indent=4))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# def get_reaction(image_path: str) -> list:\n",
+    "#     '''Returns a list of reactions extracted from the image.'''\n",
+    "#     image_file = image_path\n",
+    "#     return json.dumps(model1.predict_image_file(image_file, molscribe=True, ocr=True))\n",
+    "\n",
+    "# reaction_output = get_reaction('./pdf/2/2_image_3_1.png')\n",
+    "# print(reaction_output)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import fitz  # PyMuPDF\n",
+    "from core import run_visualheist\n",
+    "import base64\n",
+    "from openai import AzureOpenAI\n",
+    "\n",
+    "def full_pdf_extraction_pipeline_with_history(pdf_path,\n",
+    "                                  output_dir,\n",
+    "                                  api_key,\n",
+    "                                  azure_endpoint,\n",
+    "                                  model=\"gpt-4o\",\n",
+    "                                  model_size=\"large\"):\n",
+    "    \"\"\"\n",
+    "    Full pipeline: from PDF to GPT-annotated related text.\n",
+    "    Extracts markdown + figures + reaction data from a PDF and calls GPT-4o to annotate them.\n",
+    "\n",
+    "    Files written to output_dir: <pdf name>.md (page text), the figures saved by\n",
+    "    run_visualheist, <pdf name>_reaction_data.json and\n",
+    "    <pdf name>_annotated_related_text.json.\n",
+    "\n",
+    "    A figure that process_reaction_image_final fails on is logged and left out\n",
+    "    of the reaction data; the molecule-only fallback run for it is best-effort\n",
+    "    and never aborts the pipeline.\n",
+    "\n",
+    "    Args:\n",
+    "        pdf_path (str): Path to input PDF file.\n",
+    "        output_dir (str): Directory to save results.\n",
+    "        api_key (str): Azure OpenAI API key.\n",
+    "        azure_endpoint (str): Azure OpenAI endpoint.\n",
+    "        model (str): GPT model name (default \"gpt-4o\").\n",
+    "        model_size (str): VisualHeist model size (\"base\", \"large\", etc).\n",
+    "\n",
+    "    Returns:\n",
+    "        List of GPT-generated annotated related-text JSONs.\n",
+    "    \"\"\"\n",
+    "    import json  # function-scope import: this cell uses json without importing it\n",
+    "\n",
+    "    os.makedirs(output_dir, exist_ok=True)\n",
+    "\n",
+    "    # Step 1: Extract Markdown text (the document is closed when the block exits)\n",
+    "    with fitz.open(pdf_path) as doc:\n",
+    "        md_text = \"\".join(\n",
+    "            f\"\\n\\n## = Page {i} =\\n\\n\" + page.get_text()\n",
+    "            for i, page in enumerate(doc, start=1)\n",
+    "        )\n",
+    "    filename = os.path.splitext(os.path.basename(pdf_path))[0]\n",
+    "    md_path = os.path.join(output_dir, f\"{filename}.md\")\n",
+    "    with open(md_path, \"w\", encoding=\"utf-8\") as f:\n",
+    "        f.write(md_text.strip())\n",
+    "    print(f\"[✓] Markdown saved to: {md_path}\")\n",
+    "\n",
+    "    # Step 2: Extract figures using VisualHeist\n",
+    "    run_visualheist(pdf_dir=pdf_path, model_size=model_size, image_dir=output_dir)\n",
+    "    print(f\"[✓] Figures extracted to: {output_dir}\")\n",
+    "\n",
+    "    # Step 3: Parse figures to JSON\n",
+    "    image_data = []\n",
+    "    # NOTE(review): known_molecules is collected but not consumed later in this\n",
+    "    # function; presumably it is the \"history\" of the function name - confirm.\n",
+    "    known_molecules = []\n",
+    "    known_smiles = set()\n",
+    "\n",
+    "    for fname in sorted(os.listdir(output_dir)):\n",
+    "        if not fname.endswith(\".png\"):\n",
+    "            continue\n",
+    "        img_path = os.path.join(output_dir, fname)\n",
+    "        try:\n",
+    "            result = process_reaction_image_final(img_path)\n",
+    "            result[\"image_name\"] = fname\n",
+    "            image_data.append(result)\n",
+    "        except Exception as e:\n",
+    "            print(f\"[!] Failed on {fname}: {e}\")\n",
+    "            # Best-effort fallback: a second failure on the same figure must\n",
+    "            # not abort the whole run, so it is logged and skipped.\n",
+    "            try:\n",
+    "                new_mols = json.loads(get_multi_molecular_text_to_correct(img_path))\n",
+    "                for m in new_mols:\n",
+    "                    if m[\"smiles\"] not in known_smiles:\n",
+    "                        known_smiles.add(m[\"smiles\"])\n",
+    "                        known_molecules.append(m)\n",
+    "            except Exception as fallback_error:\n",
+    "                print(f\"[!] Molecule fallback failed on {fname}: {fallback_error}\")\n",
+    "\n",
+    "    json_path = os.path.join(output_dir, f\"{filename}_reaction_data.json\")\n",
+    "    with open(json_path, \"w\", encoding=\"utf-8\") as f:\n",
+    "        json.dump(image_data, f, indent=2, ensure_ascii=False)\n",
+    "    print(f\"[✓] Reaction data saved to: {json_path}\")\n",
+    "\n",
+    "    # Step 4: Call Azure GPT-4 for annotation\n",
+    "    client = AzureOpenAI(\n",
+    "        api_key=api_key,\n",
+    "        api_version=\"2024-06-01\",\n",
+    "        azure_endpoint=azure_endpoint\n",
+    "    )\n",
+    "\n",
+    "    prompt = \"\"\"\n",
+    "You are a text-mining assistant for chemistry papers. Your task is to find the most relevant 1–3 sentences in a research article that describe a given figure or scheme.\n",
+    "\n",
+    "You will be given:\n",
+    "- A block of text extracted from the article (in Markdown format).\n",
+    "- The extracted structured data from one image (including its title and list of molecules or reactions).\n",
+    "\n",
+    "Your task is:\n",
+    "1. Match the image with sentences that are most relevant to it. Use clues like the figure/scheme/table number in the title, or molecule/reaction labels (e.g., 1a, 2b, 3).\n",
+    "2. Extract up to 3 short sentences that best describe or mention the contents of the image.\n",
+    "3. In these sentences, label any molecule or reaction identifiers (like “1a”, “2b”) with their role based on context: [reactant], [product], etc.\n",
+    "4. Also label experimental conditions with their roles:\n",
+    "   - Percent values like “85%” as [yield]\n",
+    "   - Temperatures like “100 °C” as [temperature]\n",
+    "   - Time durations like “24 h”, “20 min” as [time]\n",
+    "5. Do **not** label chemical position numbers (e.g., in \"3-trifluoromethyl\", \"1,2,4-triazole\").\n",
+    "6. Do not repeat any labels. Only label each item once per sentence.\n",
+    "\n",
+    "Output format:\n",
+    "{\n",
+    "  \"title\": \"<title from image>\",\n",
+    "  \"related-text\": [\n",
+    "    \"Sentence with roles like 1a[reactant], 2c[product], 100[temperature] °C.\",\n",
+    "    ...\n",
+    "  ]\n",
+    "}\n",
+    "\"\"\"\n",
+    "\n",
+    "    annotated_results = []\n",
+    "    for item in image_data:\n",
+    "        img_path = os.path.join(output_dir, item[\"image_name\"])\n",
+    "        with open(img_path, \"rb\") as f:\n",
+    "            base64_image = base64.b64encode(f.read()).decode(\"utf-8\")\n",
+    "\n",
+    "        combined_input = f\"\"\"\n",
+    "## Image Structured Data:\n",
+    "{json.dumps(item, indent=2)}\n",
+    "\n",
+    "## Article Text:\n",
+    "{md_text}\n",
+    "\"\"\"\n",
+    "\n",
+    "        response = client.chat.completions.create(\n",
+    "            model=model,\n",
+    "            temperature=0,\n",
+    "            # JSON mode takes an object, not the bare string \"json\", and needs\n",
+    "            # the word JSON to appear in the messages (see the system message).\n",
+    "            response_format={\"type\": \"json_object\"},\n",
+    "            messages=[\n",
+    "                {\"role\": \"system\", \"content\": \"You are a helpful assistant. Respond with a single JSON object.\"},\n",
+    "                {\n",
+    "                    \"role\": \"user\",\n",
+    "                    \"content\": [\n",
+    "                        {\"type\": \"text\", \"text\": prompt + \"\\n\\n\" + combined_input},\n",
+    "                        {\n",
+    "                            \"type\": \"image_url\",\n",
+    "                            \"image_url\": {\n",
+    "                                \"url\": f\"data:image/png;base64,{base64_image}\"\n",
+    "                            }\n",
+    "                        }\n",
+    "                    ]\n",
+    "                }\n",
+    "            ]\n",
+    "        )\n",
+    "        annotated_results.append(json.loads(response.choices[0].message.content))\n",
+    "\n",
+    "    # Save the annotations next to the other outputs\n",
+    "    annotated_path = os.path.join(output_dir, f\"{filename}_annotated_related_text.json\")\n",
+    "    with open(annotated_path, \"w\", encoding=\"utf-8\") as f:\n",
+    "        json.dump(annotated_results, f, indent=2, ensure_ascii=False)\n",
+    "    print(\"[✓] Annotated related-text saved.\")\n",
+    "\n",
+    "    return annotated_results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image_path = './data/example/example1/replace/Nesting/283.jpg'\n",
+    "#image_path = './pdf/2/2_image_1_1.png'\n",
+    "result = process_reaction_image_final(image_path)\n",
+    "print(json.dumps(result, indent=4))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import os\n",
+    "\n",
+    "# image_folder = './example/example1/replace/regular/'  # 图片文件夹路径\n",
+    "# output_folder = './batches_final_repalce_regular/'  # 保存每批结果的文件夹路径\n",
+    "# batch_size = 3  # 每批处理文件数量\n",
+    "\n",
+    "# # 创建保存批次结果的文件夹（如果不存在）\n",
+    "# os.makedirs(output_folder, exist_ok=True)\n",
+    "\n",
+    "# # 获取所有图片文件并按字母顺序排序\n",
+    "# all_files = sorted([f for f in os.listdir(image_folder) if f.endswith('.jpg')])\n",
+    "\n",
+    "# # 获取已完成的批次\n",
+    "# completed_batches = [\n",
+    "#     int(f.split('_')[1].split('.')[0]) for f in os.listdir(output_folder) if f.startswith('batch_') and f.endswith('.json')\n",
+    "# ]\n",
+    "# completed_batches = sorted(completed_batches)  # 确保按顺序排序\n",
+    "\n",
+    "# # 从指定批次开始（如果有未完成批次）\n",
+    "# start_batch = (completed_batches[-1] + 1) if completed_batches else 1\n",
+    "\n",
+    "# # 将文件分批并从指定批次开始\n",
+    "# for batch_index in range((start_batch - 1) * batch_size, len(all_files), batch_size):\n",
+    "#     batch_files = all_files[batch_index:batch_index + batch_size]\n",
+    "#     results = []\n",
+    "\n",
+    "#     batch_number = batch_index // batch_size + 1\n",
+    "#     print(f\"正在按字母顺序处理第 {batch_number} 批，共 {len(batch_files)} 张图片...\")\n",
+    "    \n",
+    "#     for file_name in batch_files:\n",
+    "#         image_path = os.path.join(image_folder, file_name)\n",
+    "#         print(f\"处理文件 {file_name}...\")\n",
+    "        \n",
+    "#         try:\n",
+    "#             # 处理单个图片\n",
+    "#             result = process_reaction_image_final(image_path)\n",
+    "            \n",
+    "#             # 确保结果是字典\n",
+    "#             if isinstance(result, dict):\n",
+    "#                 # 添加文件名信息\n",
+    "#                 result_with_filename = {\n",
+    "#                     \"file_name\": file_name,\n",
+    "#                     **result\n",
+    "#                 }\n",
+    "#                 results.append(result_with_filename)\n",
+    "#                 print(result_with_filename)\n",
+    "#             else:\n",
+    "#                 print(f\"文件 {file_name} 的处理结果不是字典，跳过。\")\n",
+    "        \n",
+    "#         except Exception as e:\n",
+    "#             print(f\"处理文件 {file_name} 时出错: {e}\")\n",
+    "\n",
+    "#     # 保存当前批次结果\n",
+    "#     batch_output_path = os.path.join(output_folder, f'batch_{batch_number}.json')\n",
+    "#     with open(batch_output_path, 'w', encoding='utf-8') as json_file:\n",
+    "#         json.dump(results, json_file, ensure_ascii=False, indent=4)\n",
+    "\n",
+    "#     print(f\"第 {batch_number} 批处理完成，结果保存到 {batch_output_path}\")\n",
+    "\n",
+    "# print(\"所有批次处理完成！\")\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import rdkit\n",
+    "from rdkit import Chem\n",
+    "from rdkit.Chem import Draw\n",
+    "\n",
+    "Draw.MolToImage(Chem.MolFromSmiles('[Si](C)(C)OC(c1ccccc1)(c1ccccc1)C1CCC2=NN(Cc3ccccc3)=CN21'))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "openchemie",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

molscribe/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .interface import MolScribe

molscribe/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (189 Bytes). View file

molscribe/__pycache__/augment.cpython-310.pyc ADDED Viewed

Binary file (8.98 kB). View file