erayman09 committed on
Commit
4fd374a
·
verified ·
1 Parent(s): 45352c6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -41
app.py CHANGED
@@ -1,49 +1,96 @@
1
  import streamlit as st
2
  from PIL import Image
3
  import pytesseract
4
- import re
5
  import pandas as pd
 
6
 
7
- # Streamlit App
8
- st.title("Blood Test Report Parser")
9
 
10
- # File uploader
11
- uploaded_file = st.file_uploader("Upload a blood test report image", type=["jpg", "jpeg", "png"])
 
 
 
12
 
13
- if uploaded_file is not None:
14
- # Display uploaded image
15
- image = Image.open(uploaded_file)
16
- st.image(image, caption="Uploaded Image", use_container_width=True)
17
-
18
- # Extract text using Tesseract OCR
19
- with st.spinner("Extracting text from image..."):
20
- extracted_text = pytesseract.image_to_string(image)
21
- st.text_area("Extracted Text", extracted_text, height=300)
22
-
23
- # Regex Pattern for Parsing
24
- pattern = r"(?P<component>[A-Za-z\s%]+?)\s+(?P<your_value>[\d.]+)\s+(?P<range>[\d.]+ - [\d.]+)\s+(?P<units>[\w/%]+)(?:\s+(?P<flag>[LH]))?"
25
-
26
- # Parse Extracted Text
27
  data = []
28
- for match in re.finditer(pattern, extracted_text):
29
- component = match.group("component").strip()
30
- your_value = float(match.group("your_value"))
31
- range_min, range_max = map(float, match.group("range").split(" - "))
32
- units = match.group("units")
33
- flag = match.group("flag") if match.group("flag") else "Normal"
34
- data.append({
35
- "Component": component,
36
- "Your Value": your_value,
37
- "Min": range_min,
38
- "Max": range_max,
39
- "Units": units,
40
- "Flag": flag
41
- })
42
-
43
- # Convert to DataFrame
44
- if data:
45
- df = pd.DataFrame(data)
46
- st.success("Parsed Data Successfully!")
47
- st.dataframe(df)
48
- else:
49
- st.error("No valid data found in the extracted text.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  from PIL import Image
3
  import pytesseract
 
4
  import pandas as pd
5
+ import re
6
 
 
 
7
 
8
def extract_text(image):
    """Run Tesseract OCR over *image* and return the recognised text.

    Args:
        image: The image object handed straight to
            ``pytesseract.image_to_string``.

    Returns:
        Whatever ``pytesseract.image_to_string`` produces for the image.
    """
    ocr_output = pytesseract.image_to_string(image)
    return ocr_output
13
 
14
+
15
# Row shape recognised in the OCR output:
#   <component> <value> [<min> - <max>] [<unit>] [<flag>]
# Group 1 = component, 2 = value, 4 = raw range text, 7 = unit, 8 = flag.
_ROW_PATTERN = re.compile(
    r"^(.*?)(\d+(\.\d+)?)(\s*-?\s*\d+(\.\d+)?\s*-?\s*\d+(\.\d+)?)?\s*([a-zA-Z/%]+)?\s*(H|L|Normal)?$"
)

# Two numbers that are genuinely separated by a hyphen and/or whitespace.
# The range group of _ROW_PATTERN also accepts a single run of digits
# (e.g. "150" split as "15" + "0"), so the bounds are re-validated here.
_RANGE_PATTERN = re.compile(r"(\d+(?:\.\d+)?)(?:\s*-\s*|\s+)(\d+(?:\.\d+)?)")

# Exact-match fixes for component names that OCR is known to mangle.
_COMPONENT_CORRECTIONS = {
    "emoglobin": "Hemoglobin",
    "ematocrit": "Hematocrit",
    "% Platelet Count": "Platelet Count",
    "ymphocyte %": "Lymphocyte %",
    "L Differential Type Automated": "Differential Type",
}


def _parse_range(range_text):
    """Return ``(min, max)`` floats from *range_text*, or ``(None, None)``."""
    if not range_text:
        return None, None
    bounds = _RANGE_PATTERN.search(range_text)
    if bounds is None:
        # Only one number was captured; treat the range as unknown instead
        # of indexing past the end of the list of numbers.
        return None, None
    return float(bounds.group(1)), float(bounds.group(2))


def clean_and_parse_extracted_text(raw_text):
    """Parse raw OCR text into a table of blood-test rows.

    Every non-blank line is matched against ``_ROW_PATTERN``; lines that do
    not match are ignored. A matching line whose reference range cannot be
    read as two separated numbers is kept with ``Min``/``Max`` left empty
    rather than raising.

    Args:
        raw_text: Text as returned by the OCR step, one report row per line.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Component``,
        ``Your Value``, ``Min``, ``Max``, ``Units`` and ``Flag``. ``Flag``
        defaults to ``"Normal"`` when the line carries no H/L/Normal marker.
        The frame is empty (columns only) when no line matches.
    """
    # NOTE(review): a trailing "H"/"L" on a line that has no unit is captured
    # by the unit group, so it ends up in "Units" with Flag "Normal". Left
    # as-is because "L" could also be a legitimate unit - confirm intent.
    rows = []
    for line in raw_text.split("\n"):
        line = line.strip()
        if not line:
            continue

        match = _ROW_PATTERN.match(line)
        if match is None:
            continue

        component = match.group(1).strip()
        component = _COMPONENT_CORRECTIONS.get(component, component)
        min_val, max_val = _parse_range(match.group(4))

        rows.append(
            [
                component,
                float(match.group(2)),
                min_val,
                max_val,
                match.group(7),
                match.group(8) or "Normal",
            ]
        )

    return pd.DataFrame(
        rows,
        columns=["Component", "Your Value", "Min", "Max", "Units", "Flag"],
    )
62
+
63
+
64
def display_results(df):
    """Render the parsed results table in the Streamlit page.

    Args:
        df: The table to show; it is passed to ``st.dataframe`` and
            stretched to the container width.
    """
    st.dataframe(data=df, use_container_width=True)
69
+
70
+
71
# Streamlit app: upload a report image, OCR it, and show the parsed rows.
st.title("Blood Report Analyzer")
st.write("Upload an image of a blood test report to analyze.")

uploaded_file = st.file_uploader("Upload Image", type=["png", "jpg", "jpeg"])

if uploaded_file is not None:
    # Top-level boundary of the app: any failure while loading, reading or
    # parsing the image is reported in the page instead of a raw traceback.
    try:
        # Load the image
        image = Image.open(uploaded_file)

        # Display the uploaded image
        st.image(image, caption="Uploaded Image", use_container_width=True)

        # Extract text from the image
        extracted_text = extract_text(image)

        # Parse the extracted text into a structured format
        parsed_data = clean_and_parse_extracted_text(extracted_text)

        # Display the structured data
        st.subheader("Parsed Blood Test Results")
        if parsed_data.empty:
            # Nothing in the OCR text looked like a result row; say so
            # rather than rendering an empty table with no explanation.
            st.warning("No valid data found in the extracted text.")
        else:
            display_results(parsed_data)

    except Exception as e:
        st.error(f"An error occurred: {e}")