import gradio as gr
from huggingface_hub import InferenceClient
import pytesseract
from PIL import Image
from pypdf import PdfReader
import ocrmypdf
import os
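
# Note: pytesseract and ocrmypdf rely on the Tesseract OCR engine being installed
# on the system (ocrmypdf typically also needs Ghostscript). On a Hugging Face
# Space these system packages would usually be listed in packages.txt.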

# Image to Text

def fn_image_to_text(input_image):
    return pytesseract.image_to_string(Image.open(input_image))

# PDF to Text

def fn_pdf_to_text(input_pdf):
    reader = PdfReader(input_pdf)

    output_pdf = ""
    for page in reader.pages:
        output_pdf += page.extract_text() or ""

    image_count = 0
    for page in reader.pages:
        image_count += len(page.images)

    # Heuristic: if the PDF contains images but yielded little text, it is
    # probably scanned, so OCR it and extract the text layer instead.
    if image_count > 0 and len(output_pdf) < 1000:
        input_pdf_ocr = input_pdf.replace(".pdf", " - OCR.pdf")
        ocrmypdf.ocr(input_pdf, input_pdf_ocr, force_ocr=True)

        reader = PdfReader(input_pdf_ocr)
        output_pdf = ""
        for page in reader.pages:
            output_pdf += page.extract_text() or ""

        os.remove(input_pdf_ocr)

    return output_pdf

# Inference

model_text = "meta-llama/Llama-3.2-3B-Instruct"
model_vision = "meta-llama/Llama-3.2-11B-Vision-Instruct"

client = InferenceClient()
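# Note: with no token passed, InferenceClient uses the Hugging Face credentials
# available in the environment (e.g. an HF_TOKEN variable or a saved login);
# the gated Llama models require a token that has been granted access.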

def fn_text(
    prompt,
    history,
    input,
    #system_prompt,
    max_tokens,
    temperature,
    top_p,
):
    # Extract text from the uploaded file (image or PDF), if any.
    output = ""
    if input:
        extension = os.path.splitext(input)[1].lower()
        if extension in [".png", ".jpg", ".jpeg"]:
            output = fn_image_to_text(input)
        elif extension == ".pdf":
            output = fn_pdf_to_text(input)

    # With type="messages", Gradio passes history as a list of role/content
    # dicts, so the extracted text (as a system message) and the new user
    # prompt are appended to it directly.
    messages = [{"role": "system", "content": [{"type": "text", "text": output}]}]
    #messages = [{"role": "system", "content": [{"type": "text", "text": system_prompt}]}]
    history.append(messages[0])

    messages.append({"role": "user", "content": [{"type": "text", "text": prompt}]})
    history.append(messages[1])
    
    stream = client.chat.completions.create(
        model = model_text,
        messages = history,
        max_tokens = max_tokens,
        temperature = temperature,
        top_p = top_p,
        stream = True,
    )
    
    chunks = []
    for chunk in stream:
        chunks.append(chunk.choices[0].delta.content or "")
        yield "".join(chunks)

app_text = gr.ChatInterface(
    fn = fn_text,
    type = "messages",
    additional_inputs = [
        gr.File(type="filepath", label="Input"),
        #gr.Textbox(value="You are a helpful assistant.", label="System Prompt"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max Tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P"),
    ],
    title = "Meta Llama",
    description = model_text,
)

def fn_vision(
    prompt,
    image_url,
    #system_prompt,
    max_tokens,
    temperature,
    top_p,
):
    # Build a single user message; if an image URL was provided, attach it
    # using the OpenAI-style "image_url" content part.
    messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]

    if image_url:
        messages[0]["content"].append({"type": "image_url", "image_url": {"url": image_url}})
    
    stream = client.chat.completions.create(
        model = model_vision,
        messages = messages,
        max_tokens = max_tokens,
        temperature = temperature,
        top_p = top_p,
        stream = True,
    )
    
    chunks = []
    for chunk in stream:
        chunks.append(chunk.choices[0].delta.content or "")
        yield "".join(chunks)

app_vision = gr.Interface(
    fn = fn_vision,
    inputs = [
        gr.Textbox(label="Prompt"),
        gr.Textbox(label="Image URL")
    ],
    outputs = [
        gr.Textbox(label="Output")
    ],
    additional_inputs = [
        #gr.Textbox(value="You are a helpful assistant.", label="System Prompt"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max Tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P"),
    ],
    title = "Meta Llama",
    description = model_vision,
)

app = gr.TabbedInterface(
    [app_text, app_vision],
    ["Text", "Vision"]
)

if __name__ == "__main__":
    app.launch()