File size: 7,350 Bytes
47af600 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
import streamlit as st # Streamlit for the web app interface
from groq import Groq # Groq for handling the API requests
from PIL import Image # PIL for image processing (opening, displaying images)
import os # For file and environment variable handling
from dotenv import load_dotenv # To load environment variables from .env file
import base64 # To encode images into base64 format
import io # To handle in-memory byte buffers
# Pull any settings defined in a local .env file into the process environment.
load_dotenv()

# The Groq credential is read from the GROQ_API_KEY environment variable
# (None when the variable is not set).
api_key = os.environ.get("GROQ_API_KEY")

# Module-level Groq client shared by every request this app makes.
client = Groq(api_key=api_key)
# Configure the browser tab (title, icon) and the overall page layout.
st.set_page_config(
    page_title="Llama OCR",
    page_icon="🦙",  # llama emoji shown in the browser tab
    layout="wide",  # use the full browser width
    initial_sidebar_state="expanded",  # sidebar starts open
)
def main_content():
    """Render the main page area: title, tagline, Clear button and OCR result.

    Reads ``st.session_state['ocr_result']`` and, when it is present, shows it
    under an "Extracted Text" heading. Clicking "Clear" removes the stored
    result and reruns the script.
    """
    st.title("🦙 Llama OCR")
    # Developer-authored static HTML; raw HTML is enabled here only so the
    # negative top margin can tuck the tagline under the title.
    st.markdown(
        '<p style="margin-top: -20px;">Extract structured text from images using Llama 3.2 Vision!</p>',
        unsafe_allow_html=True,
    )
    st.markdown("---")

    # The wide first column is just a spacer that pushes the Clear button to
    # the right edge of the page.
    _, clear_col = st.columns([6, 1])
    with clear_col:
        if st.button("Clear 🗑️"):
            if 'ocr_result' in st.session_state:
                del st.session_state['ocr_result']
            st.rerun()

    if 'ocr_result' in st.session_state:
        st.markdown("### 🎯 **Extracted Text**")
        # The stored text is model output derived from a user-supplied image,
        # so it is untrusted: render it as Markdown with raw HTML left
        # disabled (the st.markdown default) to avoid HTML injection.
        st.markdown(st.session_state['ocr_result'])
def sidebar_content():
    """Render the sidebar: image uploader, preview and the Extract button.

    When an image is uploaded it is previewed via ``display_uploaded_image``.
    Pressing the Extract button runs ``process_image`` unless a result is
    already stored in ``st.session_state['ocr_result']``; in that case the
    user is told to clear it first. After a successful extraction the script
    is rerun so the freshly stored result is rendered on the page.
    """
    with st.sidebar:
        st.header("📥 Upload Image")

        if 'ocr_result' not in st.session_state:
            st.write("### Please upload an image to extract text.")

        uploaded_file = st.file_uploader("Choose an image...", type=['png', 'jpg', 'jpeg'])
        if uploaded_file:
            display_uploaded_image(uploaded_file)

            # NOTE(review): the button emoji was reconstructed from
            # mis-encoded text; confirm the intended glyph.
            if st.button("Extract Text 🔍"):
                if 'ocr_result' in st.session_state:
                    # Previously this press was ignored without any feedback.
                    st.info("A result is already displayed. Clear it before extracting again.")
                else:
                    with st.spinner("Processing image... Please wait."):
                        process_image(uploaded_file)
                    # The result is written to session state during this run;
                    # rerun so code that renders 'ocr_result' earlier in the
                    # script picks it up immediately. On failure nothing is
                    # stored, so any error message stays visible.
                    if 'ocr_result' in st.session_state:
                        st.rerun()
def display_uploaded_image(uploaded_file):
    """Show a captioned preview of the uploaded image.

    Args:
        uploaded_file: Image source accepted by ``PIL.Image.open``; the app
            passes the object returned by ``st.file_uploader``.
    """
    preview_options = {"caption": "Uploaded Image", "use_container_width": True}
    st.image(Image.open(uploaded_file), **preview_options)
def encode_image(uploaded_file):
    """Return the image as a base64 string together with its PIL format name.

    The original file bytes are encoded whenever the source exposes
    ``getvalue()`` (in-memory buffers such as ``io.BytesIO``). This avoids
    decoding and re-saving the image, which would recompress lossy formats
    like JPEG and can degrade small text before OCR. Sources without
    ``getvalue()`` fall back to re-serialising the image through PIL.

    Args:
        uploaded_file: Image source accepted by ``PIL.Image.open``.

    Returns:
        tuple: ``(base64_text, image_format)`` where ``image_format`` is the
        value of ``Image.format`` reported by PIL (for example ``"PNG"``).
    """
    image = Image.open(uploaded_file)  # used to identify the image format
    image_format = image.format

    read_all = getattr(uploaded_file, "getvalue", None)
    if callable(read_all):
        # getvalue() returns the whole buffer regardless of the current
        # read position left behind by Image.open.
        raw_bytes = read_all()
    else:
        buffered = io.BytesIO()
        image.save(buffered, format=image_format)
        raw_bytes = buffered.getvalue()

    return base64.b64encode(raw_bytes).decode('utf-8'), image_format
def process_image(uploaded_file, *, model="llama-3.2-11b-vision-preview", max_tokens=200):
    """Send the uploaded image to the Groq API and store the extracted text.

    On success the model's reply is saved in
    ``st.session_state['ocr_result']``. Any failure (unreadable image,
    unknown format, API error, empty reply) is reported in the UI and leaves
    session state untouched.

    Args:
        uploaded_file: Image source passed on to ``encode_image``. Nothing
            happens when it is falsy.
        model: Groq model identifier used for the request.
        max_tokens: Upper bound on the number of tokens in the reply.
            NOTE(review): the default of 200 cuts off anything longer than a
            short passage; consider raising it for full-page OCR.
    """
    if not uploaded_file:
        return

    # Encoding can fail on corrupt or unsupported files; report that in the
    # UI instead of letting the exception escape.
    try:
        base64_image, image_format = encode_image(uploaded_file)
    except (OSError, ValueError, KeyError) as e:
        st.error(f"Error during text extraction: {e}")
        return
    if not image_format:
        st.error("Error during text extraction: unrecognized image format")
        return

    # Inline the image as a data URL so no external hosting is required.
    base64_url = f"data:image/{image_format.lower()};base64,{base64_image}"

    prompt = (
        "Analyze the text in the provided image. Extract all readable content "
        "and present it in a structured Markdown format. Use headings, lists, "
        "or code blocks as appropriate for clarity and organization."
    )

    with st.spinner("Generating response... This may take a moment."):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {"type": "image_url", "image_url": {"url": base64_url}},
                        ],
                    }
                ],
                temperature=0.2,  # low randomness for faithful transcription
                max_tokens=max_tokens,  # caps reply length in tokens (not words)
                top_p=0.5,  # nucleus-sampling cutoff
                stream=False,  # wait for the complete reply
            )
            message_content = response.choices[0].message.content
        except Exception as e:  # UI boundary: surface the failure, do not crash the app
            st.error(f"Error during text extraction: {e}")
            return

    if not message_content:
        # Avoid storing an empty/None result that would render as a blank page.
        st.warning("The model returned no text for this image.")
        return

    st.session_state['ocr_result'] = message_content
# Script entry point: draw the main page area first, then the sidebar.
if __name__ == "__main__":
    for render_section in (main_content, sidebar_content):
        render_section()
|