Initial Commit
- app.py +51 -0
- classification_head/SBERT-finetuned-span-classifier-1_classification_head.pt +3 -0
- ocr_utils.py +30 -0
- requirements.txt +8 -0
- span_classifier.py +166 -0
- text_generator.py +10 -0
app.py
ADDED
@@ -0,0 +1,51 @@
+import gradio as gr
+import os
+import tempfile
+from ocr_utils import extract_pdf_text
+from span_classifier import load_model, predict_spans, format_results, format_final_output
+from text_generator import generate_text
+
+sbert_model, classifier_head, tokenizer, device = load_model(
+    model_path='rfahlevih/sentence-transformer-all-mpnetv2-resume-span-classifier',
+    head_path='./classification_head/SBERT-finetuned-span-classifier-1_classification_head.pt'
+)
+
+def full_pipeline(pdf_file):
+    ocr_text = extract_pdf_text(pdf_file)
+    if not ocr_text:
+        return "Oops! OCR could not run because no PDF file was provided or an error occurred.", None
+
+    spans = predict_spans(
+        full_text=ocr_text,
+        model=sbert_model,
+        classification_head=classifier_head,
+        tokenizer=tokenizer,
+        device=device
+    )
+
+    formatted = format_results(spans)
+    final_span_output = format_final_output(formatted)
+    generated = generate_text(final_span_output)
+
+    custom_filename = "result_summary.txt"
+    temp_dir = tempfile.gettempdir()
+    custom_path = os.path.join(temp_dir, custom_filename)
+
+    # Save to a temporary file
+    with open(custom_path, "w", encoding="utf-8") as f:
+        f.write(generated)
+
+    return generated, custom_path
+
+# Gradio UI
+gr.Interface(
+    fn=full_pipeline,
+    inputs=gr.File(label="Drop your CV here (.pdf)", file_types=[".pdf"]),
+    outputs=[
+        gr.Textbox(label="Summary Results"),
+        gr.File(label="Download Summary Results (.txt)")
+    ],
+    title="Curriculum Vitae Summarization using SBERT and T5",
+    description="This Curriculum Vitae summarization system was developed as part of my final project research, which focuses on problems in applicant tracking systems (ATS). To address these problems, the system uses SBERT to extract key information from CVs and a T5 model to generate a text summary based on the extracted points.",
+    flagging_mode="never"
+).launch()
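
For a quick local smoke test outside the Gradio UI, full_pipeline can be called directly with a file path (extract_pdf_text accepts both paths and file-like objects). A minimal sketch, assuming the models above have been downloaded and a local sample_cv.pdf exists (the path is a placeholder, not part of this repo):

    # Hypothetical smoke test; "sample_cv.pdf" is a placeholder path.
    summary, txt_path = full_pipeline("sample_cv.pdf")
    print(summary)    # the generated CV summary
    print(txt_path)   # location of the downloadable .txt copy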
classification_head/SBERT-finetuned-span-classifier-1_classification_head.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b92b82da6696737408c017405322541e7fa3e2490cf5c3716022dd2f06df94e
+size 17152
ocr_utils.py
ADDED
@@ -0,0 +1,30 @@
+import re
+import pdf2image
+import pytesseract
+from pathlib import Path
+import os
+
+# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
+
+def extract_pdf_text(pdf_file):
+    if pdf_file is None:
+        return None
+
+    try:
+        if isinstance(pdf_file, (str, Path)):
+            pdf_bytes = Path(pdf_file).read_bytes()
+        elif hasattr(pdf_file, "read"):
+            pdf_bytes = pdf_file.read()
+        else:
+            return None
+
+        images = pdf2image.convert_from_bytes(pdf_bytes)
+        all_text = [pytesseract.image_to_string(img) for img in images]
+        combined_text = "\n".join(all_text)
+        cleaned = re.sub(r'\s+', ' ', combined_text)
+        cleaned = re.sub(r'[^\w\s.,&%]', '', cleaned)
+        return cleaned.strip()
+
+    except Exception as e:
+        print(f"❌ Error during OCR: {e}")
+        return None
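
A minimal usage sketch for the OCR helper, assuming the Tesseract binary and pdf2image's poppler dependency are installed; the file path is a placeholder:

    # Hypothetical example; "sample_cv.pdf" is a placeholder path.
    text = extract_pdf_text("sample_cv.pdf")
    if text is None:
        print("OCR failed or no file was given")
    else:
        print(text[:200])  # first 200 characters of the cleaned OCR text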
requirements.txt
ADDED
@@ -0,0 +1,8 @@
+gradio
+pdfminer==20191125
+pdf2image
+pytesseract
+torch==2.1.0
+sentencepiece
+sentence-transformers==4.0.2
+transformers==4.51.2
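
Assuming the system packages behind these wheels are present (poppler for pdf2image, the Tesseract binary for pytesseract), the Python dependencies install in one step:

    pip install -r requirements.txt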
span_classifier.py
ADDED
@@ -0,0 +1,166 @@
+import torch
+import torch.nn as nn
+from sentence_transformers import SentenceTransformer
+from transformers import AutoTokenizer
+
+def load_model(model_path, head_path):
+    try:
+        model = SentenceTransformer(model_path)
+        classification_head = nn.Linear(model.get_sentence_embedding_dimension(), 5)
+        classification_head.load_state_dict(torch.load(head_path, map_location=torch.device('cpu')))
+
+        tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
+        device = torch.device('cpu')
+
+        model.to(device)
+        classification_head.to(device)
+
+        return model, classification_head, tokenizer, device
+    except Exception as e:
+        print(f"Error loading model: {e}")
+        raise
+
+def predict_spans(full_text, model, classification_head, tokenizer, device,
+                  window_size=384, stride=256, min_span_length=3):
+    class_thresholds = {
+        0: 0.8,
+        1: 0.7,
+        2: 0.75,
+        3: 0.7,
+        4: 0.8
+    }
+
+    label_map = {
+        0: 'personal_information',
+        1: 'skills',
+        2: 'education',
+        3: 'experience',
+        4: 'certification'
+    }
+
+    results = []
+    full_text = full_text.strip()
+
+    for i in range(0, len(full_text), stride):
+        window_text = full_text[i:i+window_size]
+
+        encoding = tokenizer(
+            window_text,
+            max_length=window_size,
+            padding='max_length',
+            truncation=True,
+            return_offsets_mapping=True,
+            return_tensors='pt'
+        ).to(device)
+
+        with torch.no_grad():
+            model_output = model({
+                'input_ids': encoding['input_ids'],
+                'attention_mask': encoding['attention_mask']
+            })
+            token_embeddings = model_output['token_embeddings']
+            token_logits = classification_head(token_embeddings)
+            token_probs = torch.softmax(token_logits, dim=2)
+
+        offset_mapping = encoding['offset_mapping'][0].cpu().numpy()
+        current_span = None
+
+        for token_idx, (start, end) in enumerate(offset_mapping):
+            if start == end == 0:
+                continue
+
+            probs = token_probs[0, token_idx]
+            max_prob, pred_label = torch.max(probs, dim=0)
+            max_prob = max_prob.item()
+            pred_label = pred_label.item()
+
+            if max_prob > class_thresholds[pred_label]:
+                token_text = window_text[start:end]
+
+                if token_text.startswith('##'):
+                    if current_span and current_span['label'] == label_map[pred_label]:
+                        current_span['text'] += token_text[2:]
+                        current_span['position'] = (current_span['position'][0], i+end)
+                        current_span['confidence'] = max(current_span['confidence'], max_prob)
+                    continue
+
+                if (current_span and
+                    current_span['label'] == label_map[pred_label] and
+                    (i+start - current_span['position'][1]) <= 2):
+
+                    current_span['text'] += ' ' + token_text
+                    current_span['position'] = (current_span['position'][0], i+end)
+                    current_span['confidence'] = max(current_span['confidence'], max_prob)
+                else:
+                    if current_span:
+                        results.append(current_span)
+                    current_span = {
+                        'text': token_text,
+                        'label': label_map[pred_label],
+                        'confidence': max_prob,
+                        'position': (i+start, i+end)
+                    }
+            else:
+                if current_span:
+                    results.append(current_span)
+                current_span = None
+
+        if current_span:
+            results.append(current_span)
+
+    filtered_results = []
+    for span in results:
+        clean_text = span['text'].strip()
+        if len(clean_text.split()) >= min_span_length or span['confidence'] > 0.9:
+            span['text'] = clean_text
+            filtered_results.append(span)
+
+    merged_results = []
+    filtered_results.sort(key=lambda x: x['position'][0])
+
+    for span in filtered_results:
+        if not merged_results:
+            merged_results.append(span)
+        else:
+            last = merged_results[-1]
+            if (span['label'] == last['label'] and
+                span['position'][0] <= last['position'][1] + 5):
+
+                merged_text = last['text'] + ' ' + span['text']
+                merged_results[-1] = {
+                    'text': merged_text,
+                    'label': span['label'],
+                    'confidence': max(last['confidence'], span['confidence']),
+                    'position': (last['position'][0], span['position'][1])
+                }
+            else:
+                merged_results.append(span)
+
+    for span in merged_results:
+        tokens = span['text'].split()
+        if len(tokens) > 15:
+            span['text'] = ' '.join(tokens[:15])
+
+    return merged_results
+
+def format_results(spans):
+    formatted = {}
+    for span in spans:
+        label = span['label']
+        if label not in formatted:
+            formatted[label] = []
+        formatted[label].append(span)
+
+    for label in formatted:
+        formatted[label].sort(key=lambda x: x['confidence'], reverse=True)
+
+    return formatted
+
+def format_final_output(formatted_results):
+    final_output = []
+    for label, items in formatted_results.items():
+        top_n = 1 if label == 'personal_information' else 3
+        label_upper = label.upper()
+        for item in items[:top_n]:
+            final_output.append(f"{label_upper}: {item['text']}")
+    return " ".join(final_output)
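
The span dictionaries flow from predict_spans through format_results into the flat prompt string consumed by T5. A small sketch with made-up span values (no model needed) showing that shaping:

    # Hypothetical spans, shaped like predict_spans output
    spans = [
        {'text': 'John Doe, Jakarta', 'label': 'personal_information',
         'confidence': 0.95, 'position': (0, 17)},
        {'text': 'Python, SQL, machine learning', 'label': 'skills',
         'confidence': 0.88, 'position': (40, 69)},
    ]
    grouped = format_results(spans)        # grouped by label, sorted by confidence
    prompt = format_final_output(grouped)
    print(prompt)
    # PERSONAL_INFORMATION: John Doe, Jakarta SKILLS: Python, SQL, machine learning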
text_generator.py
ADDED
@@ -0,0 +1,10 @@
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+model_source = "rfahlevih/t5-small-finetuned-resume-text-generation"
+tokenizer = T5Tokenizer.from_pretrained(model_source)
+model = T5ForConditionalGeneration.from_pretrained(model_source)
+
+def generate_text(input_text):
+    input_ids = tokenizer(input_text, return_tensors='pt', truncation=True, padding="max_length", max_length=512).input_ids
+    outputs = model.generate(input_ids, max_length=512, num_beams=4, early_stopping=True)
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
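
A minimal usage sketch; importing the module downloads the T5 checkpoint from the Hub, and the prompt below is a made-up example of the format produced by format_final_output:

    # Hypothetical prompt; real input comes from format_final_output
    prompt = "PERSONAL_INFORMATION: John Doe SKILLS: Python, SQL EXPERIENCE: Data Analyst"
    print(generate_text(prompt))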