Dannyar608 committed on
Commit
ce8b467
·
verified ·
1 Parent(s): 9abe9f0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -139
app.py CHANGED
@@ -14,58 +14,47 @@ except Exception as e:
14
  print(f"Could not load NER model: {e}")
15
  ner_pipeline = None
16
 
17
- # ========== IMPROVED TRANSCRIPT PARSING ==========
18
- def extract_gpa(text):
19
- gpa_data = {'weighted': "N/A", 'unweighted': "N/A"}
20
- gpa_patterns = [
21
- r'Weighted GPA[\s:]*(\d\.\d{1,2})',
22
- r'GPA \(Weighted\)[\s:]*(\d\.\d{1,2})',
23
- r'Cumulative GPA \(Weighted\)[\s:]*(\d\.\d{1,2})',
24
- r'Unweighted GPA[\s:]*(\d\.\d{1,2})',
25
- r'GPA \(Unweighted\)[\s:]*(\d\.\d{1,2})',
26
- r'Cumulative GPA \(Unweighted\)[\s:]*(\d\.\d{1,2})',
27
- r'GPA[\s:]*(\d\.\d{1,2})'
28
- ]
29
- for pattern in gpa_patterns:
30
- for match in re.finditer(pattern, text, re.IGNORECASE):
31
- gpa_value = match.group(1)
32
- if 'weighted' in pattern.lower():
33
- gpa_data['weighted'] = gpa_value
34
- elif 'unweighted' in pattern.lower():
35
- gpa_data['unweighted'] = gpa_value
36
- else:
37
- if gpa_data['unweighted'] == "N/A":
38
- gpa_data['unweighted'] = gpa_value
39
- if gpa_data['weighted'] == "N/A":
40
- gpa_data['weighted'] = gpa_value
41
- return gpa_data
42
 
43
- def extract_courses_with_regex(text):
44
- patterns = [
45
- r'(?:^|\n)([A-Z]{2,}\s*-?\s*\d{3}[A-Z]?\b)\s*([A-F][+-]?|\d{2,3}%)?',
46
- r'(?:^|\n)([A-Z][a-z]+(?:\s+[A-Z]?[a-z]+)+)\s*[:\-]?\s*([A-F][+-]?|\d{2,3}%)?',
47
- r'(?:^|\n)([A-Z]{2,})\s*\d{3}\b'
48
- ]
49
- courses = []
50
- for pattern in patterns:
51
- for match in re.finditer(pattern, text, re.MULTILINE):
52
- course_name = match.group(1).strip()
53
- grade = match.group(2).strip() if match.group(2) else None
54
- courses.append({'name': course_name, 'grade': grade})
55
- return courses
56
-
57
- def extract_grade_levels(text):
58
- grade_pattern = r'(?:Grade|Year|Term)\s*[:]?\s*(\d+|Freshman|Sophomore|Junior|Senior)\b'
59
- grade_matches = list(re.finditer(grade_pattern, text, re.IGNORECASE))
60
- grade_sections = []
61
- for i, match in enumerate(grade_matches):
62
- start_pos = match.start()
63
- end_pos = grade_matches[i+1].start() if i+1 < len(grade_matches) else len(text)
64
- grade_sections.append({
65
- 'grade': match.group(1),
66
- 'text': text[start_pos:end_pos]
67
- })
68
- return grade_sections
 
 
 
 
 
 
 
 
 
69
 
70
  def parse_transcript(file):
71
  if file.name.endswith('.pdf'):
@@ -74,83 +63,40 @@ def parse_transcript(file):
74
  for page in reader.pages:
75
  text += page.extract_text() + '\n'
76
 
77
- # Try both NER and regex approaches
78
- courses = []
79
- if ner_pipeline:
80
- try:
81
- entities = ner_pipeline(text)
82
- current_course = {}
83
- for entity in entities:
84
- if entity['word'].startswith('##'):
85
- current_course['name'] = current_course.get('name', '') + entity['word'][2:]
86
- elif entity['entity'] in ['B-ORG', 'I-ORG']: # Using ORG as proxy for courses
87
- if 'name' in current_course:
88
- courses.append(current_course)
89
- current_course = {'name': entity['word']}
90
- elif entity['entity'] == 'GRADE' and current_course:
91
- current_course['grade'] = entity['word']
92
- if current_course:
93
- courses.append(current_course)
94
- except Exception as e:
95
- print(f"NER failed: {e}")
96
 
97
- # Fallback to regex if NER didn't find courses
98
- if not courses:
99
- courses = extract_courses_with_regex(text)
100
 
101
- # Organize by grade level
102
- grade_sections = extract_grade_levels(text)
103
- courses_by_grade = defaultdict(list)
104
 
105
- if grade_sections:
106
- for section in grade_sections:
107
- section_courses = extract_courses_with_regex(section['text'])
108
- for course in section_courses:
109
- course['term'] = section['grade']
110
- courses_by_grade[section['grade']].append(course)
111
- else:
112
- courses_by_grade["All"] = courses
113
 
114
- gpa_data = extract_gpa(text)
115
-
116
- output_text = "Transcript parsed successfully\n"
117
- output_text += f"Found {len(courses)} courses across {len(courses_by_grade)} grade levels\n"
 
 
 
118
 
119
  return output_text, {
120
  "gpa": gpa_data,
 
121
  "courses": dict(courses_by_grade)
122
  }
123
- elif file.name.endswith('.csv'):
124
- df = pd.read_csv(file)
125
- elif file.name.endswith('.xlsx'):
126
- df = pd.read_excel(file)
127
  else:
128
- return "Unsupported file format", None
129
-
130
- # Fallback for CSV/Excel
131
- gpa = "N/A"
132
- for col in ['GPA', 'Grade Point Average', 'Cumulative GPA']:
133
- if col in df.columns:
134
- gpa = df[col].iloc[0] if isinstance(df[col].iloc[0], (float, int)) else "N/A"
135
- break
136
-
137
- grade_level = "N/A"
138
- for col in ['Grade Level', 'Grade', 'Class', 'Year']:
139
- if col in df.columns:
140
- grade_level = df[col].iloc[0]
141
- break
142
-
143
- courses = []
144
- for col in ['Course', 'Subject', 'Course Name', 'Class']:
145
- if col in df.columns:
146
- courses = df[col].tolist()
147
- break
148
-
149
- return f"Grade Level: {grade_level}\nGPA: {gpa}", {
150
- "gpa": {"unweighted": gpa, "weighted": "N/A"},
151
- "grade_level": grade_level,
152
- "courses": courses
153
- }
154
 
155
  # ========== LEARNING STYLE QUIZ ==========
156
  learning_style_questions = [
@@ -326,28 +272,14 @@ def transcript_display(transcript_dict):
326
  courses_by_grade = transcript_dict["courses"]
327
 
328
  if isinstance(courses_by_grade, dict):
329
- for grade, courses in courses_by_grade.items():
330
- display += f"**{grade}**\n"
331
  for course in courses:
332
- if isinstance(course, dict):
333
- display += f"- {course.get('name', 'N/A')}"
334
- if 'grade' in course:
335
- display += f" (Grade: {course['grade']})"
336
- if 'term' in course:
337
- display += f" | Term: {course['term']}"
338
- display += "\n"
339
- else:
340
- display += f"- {str(course)}\n"
341
- display += "\n"
342
- elif isinstance(courses_by_grade, list):
343
- for course in courses_by_grade:
344
- if isinstance(course, dict):
345
- display += f"- {course.get('name', 'N/A')}"
346
  if 'grade' in course:
347
  display += f" (Grade: {course['grade']})"
348
  display += "\n"
349
- else:
350
- display += f"- {str(course)}\n"
351
 
352
  if 'gpa' in transcript_dict:
353
  gpa = transcript_dict['gpa']
@@ -447,8 +379,8 @@ def generate_response(message, history):
447
  with gr.Blocks() as app:
448
  with gr.Tab("Step 1: Upload Transcript"):
449
  gr.Markdown("### Upload your transcript (PDF recommended for best results)")
450
- transcript_file = gr.File(label="Transcript file", file_types=[".pdf", ".csv", ".xlsx"])
451
- transcript_output = gr.Textbox(label="Parsing Results")
452
  transcript_data = gr.State()
453
  transcript_file.change(
454
  fn=parse_transcript,
@@ -509,5 +441,5 @@ with gr.Blocks() as app:
509
  )
510
 
511
  if __name__ == "__main__":
512
- app.launch()
513
 
 
14
  print(f"Could not load NER model: {e}")
15
  ner_pipeline = None
16
 
17
+ # ========== TRANSCRIPT PARSING FUNCTIONS ==========
18
def extract_gpa(text, gpa_type):
    """Return the first GPA value that follows the label *gpa_type* in *text*.

    Args:
        text: Raw transcript text to search.
        gpa_type: Label that precedes the value, e.g. ``'Weighted GPA'``.
            It is matched literally; regex metacharacters in it are escaped.

    Returns:
        The numeric value as a string (e.g. ``"3.85"``), or ``"N/A"`` when
        the label is absent or is not followed by a number.
    """
    pattern = (
        # Do not match the label as the tail of a longer hyphenated/letter
        # label (e.g. 'Weighted GPA' inside 'Un-Weighted GPA').
        r'(?<![A-Za-z-])'
        + re.escape(gpa_type)
        # Optional colon between label and value; the value must be a
        # well-formed number so trailing punctuation is not captured.
        + r'\s*:?\s*(\d+(?:\.\d+)?|\.\d+)'
    )
    match = re.search(pattern, text)
    return match.group(1) if match else "N/A"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
def extract_courses_from_table(text):
    """Collect course rows from transcript table text, grouped by grade level.

    Each matched row yields a dict with ``name`` (course code followed by the
    course name), ``year`` (school-year range) and ``credits``; a ``grade``
    key is added only when the grade column is non-empty. A ``DE:`` marker in
    the course name is expanded to ``Dual Enrollment:``.

    Args:
        text: Transcript text containing the course table rows.

    Returns:
        A ``defaultdict(list)`` mapping the grade-level string captured from
        each row to the list of course dicts found for that level.
    """
    row_regex = re.compile(
        r'(\d{4}-\d{4})\s*'            # School year
        r'\|?\s*(\d+)\s*'              # Grade level
        r'\|?\s*([A-Z0-9]+)\s*'        # Course code
        r'\|?\s*([^\|]+?)\s*'          # Course name (up to the next pipe)
        r'(?:\|\s*[^\|]*){2}'          # Skip Term and DstNumber
        r'\|\s*([A-FW]?)\s*'           # Grade (FG column)
        r'(?:\|\s*[^\|]*)'             # Skip Incl column
        r'\|\s*([\d\.]+|inProgress)'   # Credits
    )

    grouped = defaultdict(list)

    for row in row_regex.finditer(text):
        school_year = row.group(1)
        level = row.group(2)
        code = row.group(3)
        # Tidy the captured name and spell out the dual-enrollment marker.
        title = row.group(4).strip().replace('DE:', 'Dual Enrollment:')
        letter = row.group(5)
        earned = row.group(6)

        entry = {
            'name': f"{code} {title}",
            'year': school_year,
            'credits': earned,
        }

        # The grade column may be blank (e.g. for courses without a grade yet).
        letter = letter.strip() if letter else ''
        if letter:
            entry['grade'] = letter

        grouped[level].append(entry)

    return grouped
58
 
59
  def parse_transcript(file):
60
  if file.name.endswith('.pdf'):
 
63
  for page in reader.pages:
64
  text += page.extract_text() + '\n'
65
 
66
+ # Extract GPA information
67
+ gpa_data = {
68
+ 'weighted': extract_gpa(text, 'Weighted GPA'),
69
+ 'unweighted': extract_gpa(text, 'Un-weighted GPA')
70
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
+ # Extract current grade level
73
+ grade_match = re.search(r'Current Grade:\s*(\d+)', text)
74
+ grade_level = grade_match.group(1) if grade_match else "Unknown"
75
 
76
+ # Extract all courses with grades and year taken
77
+ courses_by_grade = extract_courses_from_table(text)
 
78
 
79
+ # Prepare output text
80
+ output_text = f"Grade Level: {grade_level}\n"
81
+ output_text += f"Weighted GPA: {gpa_data['weighted']}\n"
82
+ output_text += f"Unweighted GPA: {gpa_data['unweighted']}\n\n"
83
+ output_text += "Course History:\n"
 
 
 
84
 
85
+ for grade, courses in sorted(courses_by_grade.items()):
86
+ output_text += f"\nGrade {grade}:\n"
87
+ for course in courses:
88
+ output_text += f"- {course['name']}"
89
+ if 'grade' in course:
90
+ output_text += f" (Grade: {course['grade']})"
91
+ output_text += "\n"
92
 
93
  return output_text, {
94
  "gpa": gpa_data,
95
+ "grade_level": grade_level,
96
  "courses": dict(courses_by_grade)
97
  }
 
 
 
 
98
  else:
99
+ return "Unsupported file format (PDF only for transcript parsing)", None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  # ========== LEARNING STYLE QUIZ ==========
102
  learning_style_questions = [
 
272
  courses_by_grade = transcript_dict["courses"]
273
 
274
  if isinstance(courses_by_grade, dict):
275
+ for grade, courses in sorted(courses_by_grade.items()):
276
+ display += f"**Grade {grade}**\n"
277
  for course in courses:
278
+ display += f"- {course['name']}"
 
 
 
 
 
 
 
 
 
 
 
 
 
279
  if 'grade' in course:
280
  display += f" (Grade: {course['grade']})"
281
  display += "\n"
282
+ display += "\n"
 
283
 
284
  if 'gpa' in transcript_dict:
285
  gpa = transcript_dict['gpa']
 
379
  with gr.Blocks() as app:
380
  with gr.Tab("Step 1: Upload Transcript"):
381
  gr.Markdown("### Upload your transcript (PDF recommended for best results)")
382
+ transcript_file = gr.File(label="Transcript file", file_types=[".pdf"])
383
+ transcript_output = gr.Textbox(label="Transcript Results", lines=10)
384
  transcript_data = gr.State()
385
  transcript_file.change(
386
  fn=parse_transcript,
 
441
  )
442
 
443
  if __name__ == "__main__":
444
+ app.launch()
445