Update app.py
Browse files
app.py
CHANGED
@@ -1,38 +1,29 @@
|
|
1 |
-
import
|
|
|
|
|
2 |
import pandas as pd
|
3 |
-
import
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
"
|
8 |
-
|
9 |
-
"
|
10 |
-
|
11 |
-
|
12 |
-
"MCH": (27, 33),
|
13 |
-
"MCHC": (31, 36),
|
14 |
-
"Neutrophil %": (49, 74),
|
15 |
-
"Lymphocyte %": (26, 46),
|
16 |
-
"Monocyte %": (2, 12),
|
17 |
-
"Eosinophil %": (0, 5),
|
18 |
-
"Basophil %": (0, 2),
|
19 |
-
"Abs. Neutrophil": (2.0, 8.0),
|
20 |
-
"Abs. Lymphocyte": (1.2, 4.8),
|
21 |
-
"Abs. Monocyte": (0.0, 0.8),
|
22 |
-
"Abs. Eosinophil": (0.0, 0.5),
|
23 |
-
"Abs. Basophil": (0.0, 0.2),
|
24 |
-
}
|
25 |
|
26 |
def clean_and_parse_extracted_text(raw_text):
|
27 |
"""
|
28 |
Parse and clean the raw text to extract structured data.
|
29 |
"""
|
|
|
30 |
lines = raw_text.split("\n")
|
31 |
lines = [line.strip() for line in lines if line.strip()]
|
32 |
|
|
|
33 |
data = []
|
34 |
for line in lines:
|
35 |
-
# Match rows
|
36 |
match = re.match(
|
37 |
r"^(.*?)(\d+(\.\d+)?)(\s*-?\s*\d+(\.\d+)?\s*-?\s*\d+(\.\d+)?)?\s*([a-zA-Z/%]+)?\s*(H|L|Normal)?$",
|
38 |
line,
|
@@ -48,16 +39,9 @@ def clean_and_parse_extracted_text(raw_text):
|
|
48 |
else:
|
49 |
min_val = None
|
50 |
max_val = None
|
51 |
-
|
52 |
unit = match.group(7)
|
53 |
flag = "Normal" # Default flag
|
54 |
|
55 |
-
# Use default ranges if OCR fails to extract them
|
56 |
-
if min_val is None or max_val is None:
|
57 |
-
default_range = DEFAULT_RANGES.get(component)
|
58 |
-
if default_range:
|
59 |
-
min_val, max_val = default_range
|
60 |
-
|
61 |
# Determine the flag based on value and range
|
62 |
if min_val is not None and max_val is not None:
|
63 |
if value < min_val:
|
@@ -71,40 +55,49 @@ def clean_and_parse_extracted_text(raw_text):
|
|
71 |
# Create a DataFrame
|
72 |
df = pd.DataFrame(data, columns=["Component", "Your Value", "Min", "Max", "Units", "Flag"])
|
73 |
|
74 |
-
#
|
75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
|
77 |
return df
|
78 |
|
79 |
-
|
|
|
80 |
"""
|
81 |
-
|
82 |
"""
|
83 |
-
|
84 |
-
import pytesseract
|
85 |
-
raw_text = pytesseract.image_to_string(image)
|
86 |
|
87 |
-
# Step 2: Parse and analyze the extracted text
|
88 |
-
df = clean_and_parse_extracted_text(raw_text)
|
89 |
|
90 |
-
|
91 |
-
|
|
|
92 |
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
outputs=gr.DataFrame(label="Blood Test Analysis"),
|
105 |
-
title="Blood Test Analyzer",
|
106 |
-
description="Upload an image of your blood test report to analyze the values and flag abnormalities.",
|
107 |
-
)
|
108 |
|
109 |
-
|
110 |
-
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from PIL import Image
|
3 |
+
import pytesseract
|
4 |
import pandas as pd
|
5 |
+
import re
|
6 |
+
|
7 |
+
|
8 |
+
def extract_text(image):
|
9 |
+
"""
|
10 |
+
Extract text from the image using Tesseract.
|
11 |
+
"""
|
12 |
+
return pytesseract.image_to_string(image)
|
13 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
def clean_and_parse_extracted_text(raw_text):
|
16 |
"""
|
17 |
Parse and clean the raw text to extract structured data.
|
18 |
"""
|
19 |
+
# Split the text into lines and clean up
|
20 |
lines = raw_text.split("\n")
|
21 |
lines = [line.strip() for line in lines if line.strip()]
|
22 |
|
23 |
+
# Identify and extract rows with valid components
|
24 |
data = []
|
25 |
for line in lines:
|
26 |
+
# Match rows containing numeric ranges and values
|
27 |
match = re.match(
|
28 |
r"^(.*?)(\d+(\.\d+)?)(\s*-?\s*\d+(\.\d+)?\s*-?\s*\d+(\.\d+)?)?\s*([a-zA-Z/%]+)?\s*(H|L|Normal)?$",
|
29 |
line,
|
|
|
39 |
else:
|
40 |
min_val = None
|
41 |
max_val = None
|
|
|
42 |
unit = match.group(7)
|
43 |
flag = "Normal" # Default flag
|
44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
# Determine the flag based on value and range
|
46 |
if min_val is not None and max_val is not None:
|
47 |
if value < min_val:
|
|
|
55 |
# Create a DataFrame
|
56 |
df = pd.DataFrame(data, columns=["Component", "Your Value", "Min", "Max", "Units", "Flag"])
|
57 |
|
58 |
+
# Fix misspellings and inconsistencies (if any known issues exist)
|
59 |
+
correction_map = {
|
60 |
+
"emoglobin": "Hemoglobin",
|
61 |
+
"ematocrit": "Hematocrit",
|
62 |
+
"% Platelet Count": "Platelet Count",
|
63 |
+
"ymphocyte %": "Lymphocyte %",
|
64 |
+
"L Differential Type Automated": "Differential Type",
|
65 |
+
}
|
66 |
+
df["Component"] = df["Component"].replace(correction_map)
|
67 |
|
68 |
return df
|
69 |
|
70 |
+
|
71 |
+
def display_results(df):
|
72 |
"""
|
73 |
+
Display the parsed data in a table format.
|
74 |
"""
|
75 |
+
st.dataframe(df, use_container_width=True)
|
|
|
|
|
76 |
|
|
|
|
|
77 |
|
78 |
+
# Streamlit app
|
79 |
+
st.title("Blood Report Analyzer")
|
80 |
+
st.write("Upload an image of a blood test report to analyze.")
|
81 |
|
82 |
+
uploaded_file = st.file_uploader("Upload Image", type=["png", "jpg", "jpeg"])
|
83 |
+
|
84 |
+
if uploaded_file is not None:
|
85 |
+
try:
|
86 |
+
# Load the image
|
87 |
+
image = Image.open(uploaded_file)
|
88 |
+
|
89 |
+
# Display the uploaded image
|
90 |
+
st.image(image, caption="Uploaded Image", use_container_width=True)
|
91 |
+
|
92 |
+
# Extract text from the image
|
93 |
+
extracted_text = extract_text(image)
|
94 |
+
|
95 |
+
# Parse the extracted text into a structured format
|
96 |
+
parsed_data = clean_and_parse_extracted_text(extracted_text)
|
97 |
|
98 |
+
# Display the structured data
|
99 |
+
st.subheader("Parsed Blood Test Results")
|
100 |
+
display_results(parsed_data)
|
|
|
|
|
|
|
|
|
101 |
|
102 |
+
except Exception as e:
|
103 |
+
st.error(f"An error occurred: {e}")
|