rfahlevih committed
Commit 5581268 · verified · 1 Parent(s): 8608547

Initial Commit
app.py ADDED
@@ -0,0 +1,51 @@
+ import gradio as gr
+ import os
+ import tempfile
+ from ocr_utils import extract_pdf_text
+ from span_classifier import load_model, predict_spans, format_results, format_final_output
+ from text_generator import generate_text
+
+ sbert_model, classifier_head, tokenizer, device = load_model(
+     model_path='rfahlevih/sentence-transformer-all-mpnetv2-resume-span-classifier',
+     head_path='./classification_head/SBERT-finetuned-span-classifier-1_classification_head.pt'
+ )
+
+ def full_pipeline(pdf_file):
+     ocr_text = extract_pdf_text(pdf_file)
+     if not ocr_text:
+         return "Oops! OCR could not be performed because no PDF file was provided or an error occurred.", None
+
+     spans = predict_spans(
+         full_text=ocr_text,
+         model=sbert_model,
+         classification_head=classifier_head,
+         tokenizer=tokenizer,
+         device=device
+     )
+
+     formatted = format_results(spans)
+     final_span_output = format_final_output(formatted)
+     generated = generate_text(final_span_output)
+
+     custom_filename = "result_summary.txt"
+     temp_dir = tempfile.gettempdir()
+     custom_path = os.path.join(temp_dir, custom_filename)
+
+     # Save the summary to a temporary file so Gradio can serve it for download
+     with open(custom_path, "w", encoding="utf-8") as f:
+         f.write(generated)
+
+     return generated, custom_path
+
+ # Gradio UI
+ gr.Interface(
+     fn=full_pipeline,
+     inputs=gr.File(label="Drop your CV here (.pdf)", file_types=[".pdf"]),
+     outputs=[
+         gr.Textbox(label="Summary Results"),
+         gr.File(label="Download Summary Results (.txt)")
+     ],
+     title="Curriculum Vitae Summarization using SBERT and T5",
+     description="This Curriculum Vitae summarization system was developed as part of my final project research, which focuses on problems in applicant tracking systems (ATS). The system uses SBERT to extract key information from a CV and a T5 model to generate a text summary from the extracted points.",
+     flagging_mode="never"
+ ).launch()
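
Because app.py builds and launches the Gradio interface at import time, importing full_pipeline from another script would start the server as a side effect. For a quick smoke test it is easier to drive the modules directly; a minimal sketch, assuming a sample resume.pdf in the working directory and network access to download both models (the filename is illustrative):

# smoke_test.py (hypothetical): runs the same OCR -> span classification -> T5 steps as full_pipeline
from ocr_utils import extract_pdf_text
from span_classifier import load_model, predict_spans, format_results, format_final_output
from text_generator import generate_text

sbert_model, head, tok, device = load_model(
    model_path='rfahlevih/sentence-transformer-all-mpnetv2-resume-span-classifier',
    head_path='./classification_head/SBERT-finetuned-span-classifier-1_classification_head.pt'
)
text = extract_pdf_text("resume.pdf")  # illustrative sample CV
if text:
    spans = predict_spans(text, sbert_model, head, tok, device)
    print(generate_text(format_final_output(format_results(spans))))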
classification_head/SBERT-finetuned-span-classifier-1_classification_head.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4b92b82da6696737408c017405322541e7fa3e2490cf5c3716022dd2f06df94e
+ size 17152
ocr_utils.py ADDED
@@ -0,0 +1,29 @@
+ import re
+ import pdf2image
+ import pytesseract
+ from pathlib import Path
+
+ # pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
+
+ def extract_pdf_text(pdf_file):
+     if pdf_file is None:
+         return None
+
+     try:
+         if isinstance(pdf_file, (str, Path)):
+             pdf_bytes = Path(pdf_file).read_bytes()
+         elif hasattr(pdf_file, "read"):
+             pdf_bytes = pdf_file.read()
+         else:
+             return None
+
+         images = pdf2image.convert_from_bytes(pdf_bytes)
+         all_text = [pytesseract.image_to_string(img) for img in images]
+         combined_text = "\n".join(all_text)
+         cleaned = re.sub(r'\s+', ' ', combined_text)
+         cleaned = re.sub(r'[^\w\s.,&%]', '', cleaned)
+         return cleaned.strip()
+
+     except Exception as e:
+         print(f"❌ Error during OCR: {e}")
+         return None
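
extract_pdf_text accepts either a filesystem path or a file-like object, rasterizes each page, OCRs it, and returns a single cleaned string (or None on any failure). A minimal usage sketch, with an illustrative path:

from ocr_utils import extract_pdf_text

text = extract_pdf_text("sample_cv.pdf")  # illustrative path; an open binary file handle also works
print(text[:200] if text else "OCR failed or no file provided")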
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ gradio
+ pdfminer==20191125
+ pdf2image
+ pytesseract
+ torch==2.1.0
+ sentencepiece
+ sentence-transformers==4.0.2
+ transformers==4.51.2
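
Note that pdf2image and pytesseract are thin wrappers around system binaries: pdf2image calls Poppler and pytesseract calls the tesseract executable, and pip installs neither binary. On a Hugging Face Space the usual way to pull these in is a packages.txt file listing Debian packages; this commit does not include one, but a plausible version would be:

poppler-utils
tesseract-ocr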
span_classifier.py ADDED
@@ -0,0 +1,166 @@
+ import torch
+ import torch.nn as nn
+ from sentence_transformers import SentenceTransformer
+ from transformers import AutoTokenizer
+
+ def load_model(model_path, head_path):
+     try:
+         model = SentenceTransformer(model_path)
+         classification_head = nn.Linear(model.get_sentence_embedding_dimension(), 5)
+         classification_head.load_state_dict(torch.load(head_path, map_location=torch.device('cpu')))
+
+         tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
+         device = torch.device('cpu')
+
+         model.to(device)
+         classification_head.to(device)
+
+         return model, classification_head, tokenizer, device
+     except Exception as e:
+         print(f"Error loading model: {e}")
+         raise
+
+ def predict_spans(full_text, model, classification_head, tokenizer, device,
+                   window_size=384, stride=256, min_span_length=3):
+     class_thresholds = {
+         0: 0.8,
+         1: 0.7,
+         2: 0.75,
+         3: 0.7,
+         4: 0.8
+     }
+
+     label_map = {
+         0: 'personal_information',
+         1: 'skills',
+         2: 'education',
+         3: 'experience',
+         4: 'certification'
+     }
+
+     results = []
+     full_text = full_text.strip()
+
+     for i in range(0, len(full_text), stride):
+         window_text = full_text[i:i+window_size]
+
+         encoding = tokenizer(
+             window_text,
+             max_length=window_size,
+             padding='max_length',
+             truncation=True,
+             return_offsets_mapping=True,
+             return_tensors='pt'
+         ).to(device)
+
+         with torch.no_grad():
+             model_output = model({
+                 'input_ids': encoding['input_ids'],
+                 'attention_mask': encoding['attention_mask']
+             })
+             token_embeddings = model_output['token_embeddings']
+             token_logits = classification_head(token_embeddings)
+             token_probs = torch.softmax(token_logits, dim=2)
+
+         offset_mapping = encoding['offset_mapping'][0].cpu().numpy()
+         current_span = None
+
+         for token_idx, (start, end) in enumerate(offset_mapping):
+             if start == end == 0:
+                 continue
+
+             probs = token_probs[0, token_idx]
+             max_prob, pred_label = torch.max(probs, dim=0)
+             max_prob = max_prob.item()
+             pred_label = pred_label.item()
+
+             if max_prob > class_thresholds[pred_label]:
+                 token_text = window_text[start:end]
+
+                 if token_text.startswith('##'):
+                     if current_span and current_span['label'] == label_map[pred_label]:
+                         current_span['text'] += token_text[2:]
+                         current_span['position'] = (current_span['position'][0], i+end)
+                         current_span['confidence'] = max(current_span['confidence'], max_prob)
+                     continue
+
+                 if (current_span and
+                     current_span['label'] == label_map[pred_label] and
+                     (i+start - current_span['position'][1]) <= 2):
+
+                     current_span['text'] += ' ' + token_text
+                     current_span['position'] = (current_span['position'][0], i+end)
+                     current_span['confidence'] = max(current_span['confidence'], max_prob)
+                 else:
+                     if current_span:
+                         results.append(current_span)
+                     current_span = {
+                         'text': token_text,
+                         'label': label_map[pred_label],
+                         'confidence': max_prob,
+                         'position': (i+start, i+end)
+                     }
+             else:
+                 if current_span:
+                     results.append(current_span)
+                 current_span = None
+
+         if current_span:
+             results.append(current_span)
+
+     filtered_results = []
+     for span in results:
+         clean_text = span['text'].strip()
+         if len(clean_text.split()) >= min_span_length or span['confidence'] > 0.9:
+             span['text'] = clean_text
+             filtered_results.append(span)
+
+     merged_results = []
+     filtered_results.sort(key=lambda x: x['position'][0])
+
+     for span in filtered_results:
+         if not merged_results:
+             merged_results.append(span)
+         else:
+             last = merged_results[-1]
+             if (span['label'] == last['label'] and
+                 span['position'][0] <= last['position'][1] + 5):
+
+                 merged_text = last['text'] + ' ' + span['text']
+                 merged_results[-1] = {
+                     'text': merged_text,
+                     'label': span['label'],
+                     'confidence': max(last['confidence'], span['confidence']),
+                     'position': (last['position'][0], span['position'][1])
+                 }
+             else:
+                 merged_results.append(span)
+
+     for span in merged_results:
+         tokens = span['text'].split()
+         if len(tokens) > 15:
+             span['text'] = ' '.join(tokens[:15])
+
+     return merged_results
+
+ def format_results(spans):
+     formatted = {}
+     for span in spans:
+         label = span['label']
+         if label not in formatted:
+             formatted[label] = []
+         formatted[label].append(span)
+
+     for label in formatted:
+         formatted[label].sort(key=lambda x: x['confidence'], reverse=True)
+
+     return formatted
+
+ def format_final_output(formatted_results):
+     final_output = []
+     for label, items in formatted_results.items():
+         top_n = 1 if label == 'personal_information' else 3
+         label_upper = label.upper()
+         for item in items[:top_n]:
+             final_output.append(f"{label_upper}: {item['text']}")
+     return " ".join(final_output)
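
Each span returned by predict_spans is a dict with text, label, confidence, and position keys, which format_results then groups by label and sorts by descending confidence. A quick way to inspect the raw spans, reusing the model paths from app.py and an illustrative input string:

from span_classifier import load_model, predict_spans

model, head, tok, device = load_model(
    model_path='rfahlevih/sentence-transformer-all-mpnetv2-resume-span-classifier',
    head_path='./classification_head/SBERT-finetuned-span-classifier-1_classification_head.pt'
)
spans = predict_spans("JOHN DOE Software Engineer Jakarta 2019 2023 Python SQL", model, head, tok, device)
for span in spans:
    print(f"{span['label']:<22} conf={span['confidence']:.2f} pos={span['position']} {span['text']}")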
text_generator.py ADDED
@@ -0,0 +1,10 @@
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+ model_source = "rfahlevih/t5-small-finetuned-resume-text-generation"
+ tokenizer = T5Tokenizer.from_pretrained(model_source)
+ model = T5ForConditionalGeneration.from_pretrained(model_source)
+
+ def generate_text(input_text):
+     input_ids = tokenizer(input_text, return_tensors='pt', truncation=True, padding="max_length", max_length=512).input_ids
+     outputs = model.generate(input_ids, max_length=512, num_beams=4, early_stopping=True)
+     return tokenizer.decode(outputs[0], skip_special_tokens=True)
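
generate_text expects the flat "LABEL: text" string that format_final_output produces. A standalone sketch with made-up span content:

from text_generator import generate_text

# The prompt mimics format_final_output: upper-cased labels, each followed by the extracted span text
prompt = ("PERSONAL_INFORMATION: John Doe Jakarta Indonesia "
          "SKILLS: Python SQL Machine Learning "
          "EXPERIENCE: Data Analyst at ExampleCorp 2021 2023")
print(generate_text(prompt))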