Spaces:

tohoku-nlp
/

Sketch2Diagram

Sleeping

App Files Files Community

DaddyDaniel committed on May 1

Commit

e8a63bc

2 Parent(s): f4b5b0a 847829a

Merge branch 'streamlit'

Browse files

# Conflicts:
# main_page.py
# requirements.txt
# sketch2diagram.py

Files changed (9) hide show

.dockerignore +1 -0
Dockerfile +34 -0
NLP_Group_logo.png +0 -0
app.py +15 -0
main_page.py +6 -0
qwen2_inference.py +108 -0
requirements.txt +11 -2
sketch2diagram.py +45 -13
util.py +26 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ .venv

Dockerfile ADDED Viewed

	@@ -0,0 +1,34 @@

+FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+# Set environment variables to reduce interactive prompts
+ENV DEBIAN_FRONTEND=noninteractive
+# Install system dependencies: Python, git, a LaTeX toolchain (pdflatex is
+# invoked by util.py) and poppler (required by pdf2image)
+RUN apt-get update && apt-get install -y \
+    python3.10 \
+    python3-pip \
+    git \
+    texlive-latex-base \
+    texlive-latex-extra \
+    texlive-fonts-recommended \
+    texlive-latex-recommended \
+    latexmk \
+    poppler-utils \
+    && rm -rf /var/lib/apt/lists/*
+# Install Python dependencies first so this layer is cached independently of
+# source changes
+WORKDIR /app
+COPY requirements.txt .
+RUN pip install --no-cache-dir --upgrade pip \
+    && pip install --no-cache-dir -r requirements.txt
+ENV PATH="/root/.local/bin:$PATH"
+# Use the key=value form; the space-separated `ENV key value` form is legacy.
+ENV STREAMLIT_WATCHER_TYPE=none
+# NOTE(review): Streamlit derives env var names from config keys, so the file
+# watcher option (server.fileWatcherType) is presumably read from the variable
+# below rather than the one above; both are set until that is confirmed.
+ENV STREAMLIT_SERVER_FILE_WATCHER_TYPE=none
+# Prebuilt flash-attn wheel matching CUDA 12.4 / torch 2.6 / CPython 3.10
+RUN pip install --no-cache-dir https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.0.6/flash_attn-2.6.3+cu124torch2.6-cp310-cp310-linux_x86_64.whl
+# Copy the application sources
+COPY . .
+# Default command
+ENTRYPOINT ["streamlit", "run", "app.py"]

NLP_Group_logo.png ADDED Viewed

app.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import os
+import streamlit as st
+from PIL import Image
+logo_path = os.path.join(os.path.dirname(__file__), "NLP_Group_logo.png")
+logo = Image.open(logo_path)
+st.logo(logo, size="large")
+main_page = st.Page("main_page.py", title="Main Page", icon="🏠")
+sketch2diagram_page = st.Page("sketch2diagram.py", title="Sketch2Diagram", icon="🖼️")
+# Add pages to the main page
+pg = st.navigation([main_page, sketch2diagram_page])
+pg.run()

main_page.py CHANGED Viewed

@@ -3,3 +3,9 @@ import streamlit as st
 st.title("Tohoku NLP Group - Language and Information Science Laboratory ")
 st.write("Welcome to the Language and Information Science Laboratory!")
 st.write("We are working on various projects and research focused on Visual Language Models.")

 st.title("Tohoku NLP Group - Language and Information Science Laboratory ")
 st.write("Welcome to the Language and Information Science Laboratory!")
 st.write("We are working on various projects and research focused on Visual Language Models.")
+# Point visitors at the Sketch2Diagram demo through a relative Markdown link.
+# NOTE(review): the link target "sketch2diagram" presumably matches the URL
+# path Streamlit assigns to the page registered in app.py — confirm.
+st.subheader("You can check out our models and demos here:")
+st.write("[Sketch2Diagram](sketch2diagram) - A model that generates TikZ code from sketches.")

qwen2_inference.py ADDED Viewed

	@@ -0,0 +1,108 @@

+import os
+import streamlit as st
+import torch
+from PIL import Image
+from dotenv import load_dotenv
+from qwen_vl_utils import process_vision_info
+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+load_dotenv()
+HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN")
+def print_gpu_memory(label, memory_allocated, memory_reserved):
+    if torch.cuda.is_available():
+        print("-----------------------------------")
+        print(f"{label} GPU Memory Usage:")
+        print(f"Allocated: {memory_allocated / 1024 ** 2:.2f} MB")
+        print(f"Cached: {memory_reserved / 1024 ** 2:.2f} MB")
+# Inference steps taken from https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct
+# @st.cache_resource
+def get_model(model_path):
+    try:
+        with st.spinner(f"Loading model {model_path}"):
+            # Load the model here
+            model_import = Qwen2VLForConditionalGeneration.from_pretrained(
+                model_path, torch_dtype="auto", device_map="auto",
+                attn_implementation="flash_attention_2",
+                token=HUGGINGFACE_TOKEN,
+            )
+            model_import = model_import.to("cuda")
+            size = {
+                "shortest_edge": 224,
+                "longest_edge": 1024,
+            }
+            processor_import = AutoProcessor.from_pretrained("itsumi-st/imgtikz_qwen2vl",
+                                                             size=size,
+                                                             min_pixels=256 * 256,
+                                                             max_pixels=1024 * 1024,
+                                                             token=HUGGINGFACE_TOKEN)
+            processor_import.tokenizer.padding_side = 'left'
+            return model_import, processor_import
+    except Exception as e:
+        st.error(f"Error loading model: {e}")
+        return None, None
+def run_inference(input_file, model_path, args):
+    model, processor = get_model(model_path)
+    if model is None or processor is None:
+        return "Error loading model."
+    # GPU Memory after model loading:
+    after_model_dump = (torch.cuda.memory_allocated(), torch.cuda.memory_reserved())
+    image = Image.open(input_file)
+    conversation = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": "Please generate TikZ code to draw the diagram of the given image."}
+            ],
+        }
+    ]
+    text_prompt = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
+    image_input, video_inputs = process_vision_info(conversation)
+    inputs = processor(
+        text=[text_prompt],
+        images=image_input,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    ).to("cuda")
+    # GPU Memory after input processing
+    after_input_dump = (torch.cuda.memory_allocated(), torch.cuda.memory_reserved())
+    output_ids = model.generate(**inputs,
+                                max_new_tokens=args['max_length'],
+                                do_sample=True,
+                                top_p=args['top_p'],
+                                top_k=args['top_k'],
+                                use_cache=True,
+                                num_return_sequences=1,
+                                pad_token_id=processor.tokenizer.pad_token_id,
+                                temperature=args['temperature']
+                                )
+    generated_ids = [
+        output_ids[len(input_ids):]
+        for input_ids, output_ids in zip(inputs.input_ids, output_ids)
+    ]
+    output_text = processor.batch_decode(
+        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
+    )
+    # GPU Memory after generation
+    after_gen_dump = (torch.cuda.memory_allocated(), torch.cuda.memory_reserved())
+    print_gpu_memory("Before Model", after_model_dump[0], after_model_dump[1])
+    print_gpu_memory("After Input", after_input_dump[0], after_input_dump[1])
+    print_gpu_memory("After Generation", after_gen_dump[0], after_gen_dump[1])
+    return output_text

requirements.txt CHANGED Viewed

@@ -1,3 +1,12 @@
 streamlit~=1.43.2
-transformers~=4.50.0
-pillow~=11.1.0

 streamlit~=1.43.2
+torch==2.6.0
+torchvision==0.21.0
+torchaudio
+transformers==4.48.2
+qwen-vl-utils==0.0.10
+packaging
+accelerate==1.0.1
+requests
+pillow
+python-dotenv
+pdf2image

sketch2diagram.py CHANGED Viewed

@@ -1,12 +1,25 @@
 import streamlit as st
-from PIL import Image
-from main import get_model
 # Sidebar Setup
 st.sidebar.title("Model Configuration")
-inference_strat = st.sidebar.selectbox("Inference Strategy", ["Iterative", "Multi-candidate"],
-                                       help="Choose the inference strategy for the model. Iterative generates one candidate at a time until an output compiles, while Multi-candidate generates multiple candidates in parallel.")
 # Introduction Section
 st.title("Sketch2Diagram")
@@ -14,7 +27,6 @@ st.title("Sketch2Diagram")
 st.write("This is a runnable demo of ImgTikZ model introduced in the Sketch2Diagram paper.")
 st.write("Please refer to the [original paper](https://openreview.net/pdf?id=KvaDHPhhir) for more details.")
 st.write("The model is trained to convert sketches into TikZ code, which can be used to generate vectorized diagrams.")
-st.write(f"Inference Strategy: {inference_strat}")
 # User Input Section
 st.subheader("Upload your sketch")
@@ -25,22 +37,42 @@ input_method = st.selectbox("Input Method", ["Upload", "Camera"],
 input_file = None
 if input_method == "Camera":
     input_file = st.camera_input("Take a picture of your sketch")
-    # Implement camera input functionality here
 else:
     input_file = st.file_uploader("Upload an image of your sketch", type=["png", "jpg", "jpeg"])
 generate_command = None
 # Display the uploaded image
 if input_file is not None:
     st.image(input_file, caption="Uploaded Sketch")
     generate_command = st.button("Generate TikZ Code")
 if generate_command:
-    model = get_model()
-    image = Image.open(input_file)
     with st.spinner("Generating TikZ code..."):
-        output = model(image)
-    tikz_code = output[0]['generated_text']
-    st.subheader("Generated TikZ Code")
-    st.code(tikz_code, language='latex')

 import streamlit as st
+from pdf2image import convert_from_path
+from qwen2_inference import run_inference
+from util import compile_tikz_to_pdf
+args = {}
 # Sidebar Setup
 st.sidebar.title("Model Configuration")
+model_name = st.sidebar.selectbox("Model Name", ['Itsumi-st/Imgtikz_Qwen2vl', 'Qwen/Qwen2-VL-7B-Instruct'])
+args['inference_strat'] = st.sidebar.selectbox("Inference Strategy", ["Iterative", "Multi-candidate"],
+                                               help="Choose the inference strategy for the model. Iterative generates one candidate at a time until an output compiles, while Multi-candidate generates multiple candidates in parallel.")
+args['max_length'] = st.sidebar.slider("Max Length", 1, 5096, 2048,
+                                       help="Maximum length of the generated output. The model will generate text up to this length.")
+args['seed'] = st.sidebar.number_input("Seed", min_value=0, value=42, step=1)
+args['temperature'] = st.sidebar.slider("Temperature", 0.0, 1.0, 0.6, step=0.01,
+                                        help="Temperature parameter for sampling. Higher values result in more random outputs.")
+args['top_p'] = st.sidebar.slider("Top P", 0.0, 1.0, 1.0, step=0.01,
+                                  help="Top P sampling parameter. The model will sample from the top P percentage of the probability distribution.")
+args['top_k'] = st.sidebar.slider("Top K", 0, 100, 50, step=1,
+                                  help="Top K sampling parameter. The model will sample from the top K tokens with the highest probabilities.")
 # Introduction Section
 st.title("Sketch2Diagram")
 st.write("This is a runnable demo of ImgTikZ model introduced in the Sketch2Diagram paper.")
 st.write("Please refer to the [original paper](https://openreview.net/pdf?id=KvaDHPhhir) for more details.")
 st.write("The model is trained to convert sketches into TikZ code, which can be used to generate vectorized diagrams.")
 # User Input Section
 st.subheader("Upload your sketch")
 input_file = None
 if input_method == "Camera":
     input_file = st.camera_input("Take a picture of your sketch")
+    # todo: Implement camera input functionality here
 else:
     input_file = st.file_uploader("Upload an image of your sketch", type=["png", "jpg", "jpeg"])
+st.write(args)
 generate_command = None
 # Display the uploaded image
 if input_file is not None:
     st.image(input_file, caption="Uploaded Sketch")
     generate_command = st.button("Generate TikZ Code")
+# Run model inference
 if generate_command:
     with st.spinner("Generating TikZ code..."):
+        output = run_inference(input_file, model_name, args)[0]
+        pdf_file_path = compile_tikz_to_pdf(output)
+        if output and pdf_file_path:
+            st.success("TikZ code generated successfully!")
+            st.code(output, language='latex')
+            st.download_button(
+                label="Download LaTeX Code",
+                data=output,
+                file_name="output.tex",
+                mime="text/plain"
+            )
+            # st.image(pdf_file_path, caption="Generated Diagram", use_column_width=True)
+            with open(pdf_file_path, "rb") as f:
+                st.download_button(
+                    label="Download PDF",
+                    data=f.read(),  # ✅ this is the binary content
+                    file_name="output.pdf",
+                    mime="application/pdf"
+                )
+            images = convert_from_path(pdf_file_path)
+            st.image(images[0], caption="Generated Diagram", use_column_width=True)
+        else:
+            st.error("Failed to generate TikZ code.")

util.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import os
+import subprocess
+import tempfile
+def compile_tikz_to_pdf(tikz_code):
+    temp_dir = tempfile.mkdtemp()
+    tex_path = os.path.join(temp_dir, "output.tex")
+    pdf_path = os.path.join(temp_dir, "output.pdf")
+    with open(tex_path, "w") as f:
+        f.write(tikz_code)
+    try:
+        subprocess.run(
+            ["pdflatex", "-interaction=nonstopmode", tex_path],
+            cwd=temp_dir,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            check=True,
+        )
+        return pdf_path
+    except subprocess.CalledProcessError as e:
+        print("PDF compilation failed:", e)
+        return None