Initial Commit
- app.py +51 -0
- classification_head/SBERT-finetuned-span-classifier-1_classification_head.pt +3 -0
- ocr_utils.py +30 -0
- requirements.txt +8 -0
- span_classifier.py +166 -0
- text_generator.py +10 -0
app.py
ADDED
@@ -0,0 +1,51 @@
+import gradio as gr
+import os
+import tempfile
+from ocr_utils import extract_pdf_text
+from span_classifier import load_model, predict_spans, format_results, format_final_output
+from text_generator import generate_text
+
+sbert_model, classifier_head, tokenizer, device = load_model(
+    model_path='rfahlevih/sentence-transformer-all-mpnetv2-resume-span-classifier',
+    head_path='./classification_head/SBERT-finetuned-span-classifier-1_classification_head.pt'
+)
+
+def full_pipeline(pdf_file):
+    ocr_text = extract_pdf_text(pdf_file)
+    if not ocr_text:
+        return "Oops! OCR could not run because no PDF file was provided or an error occurred.", None
+
+    spans = predict_spans(
+        full_text=ocr_text,
+        model=sbert_model,
+        classification_head=classifier_head,
+        tokenizer=tokenizer,
+        device=device
+    )
+
+    formatted = format_results(spans)
+    final_span_output = format_final_output(formatted)
+    generated = generate_text(final_span_output)
+
+    custom_filename = "result_summary.txt"
+    temp_dir = tempfile.gettempdir()
+    custom_path = os.path.join(temp_dir, custom_filename)
+
+    # Save to a temporary file
+    with open(custom_path, "w", encoding="utf-8") as f:
+        f.write(generated)
+
+    return generated, custom_path
+
+# Gradio UI
+gr.Interface(
+    fn=full_pipeline,
+    inputs=gr.File(label="Drop your CV here (.pdf)", file_types=[".pdf"]),
+    outputs=[
+        gr.Textbox(label="Summary Results"),
+        gr.File(label="Download Summary Results (.txt)")
+    ],
+    title="Curriculum Vitae Summarization using SBERT and T5",
+    description="This Curriculum Vitae summarization system was developed as part of my final project research, which focuses on problems in applicant tracking systems (ATS). To address these problems, the system uses SBERT to extract key information from CVs and a T5 model to generate a text summary based on the extracted points.",
+    flagging_mode="never"
+).launch()
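
For a quick local smoke test outside the Gradio UI, full_pipeline can be called directly with a file path (extract_pdf_text accepts both paths and file-like objects). A minimal sketch, assuming the models above have been downloaded and a local sample_cv.pdf exists (the path is a placeholder, not part of this repo):

    # Hypothetical smoke test; "sample_cv.pdf" is a placeholder path.
    summary, txt_path = full_pipeline("sample_cv.pdf")
    print(summary)    # the generated CV summary
    print(txt_path)   # location of the downloadable .txt copy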
classification_head/SBERT-finetuned-span-classifier-1_classification_head.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b92b82da6696737408c017405322541e7fa3e2490cf5c3716022dd2f06df94e
+size 17152
ocr_utils.py
ADDED
@@ -0,0 +1,30 @@
+import re
+import pdf2image
+import pytesseract
+from pathlib import Path
+import os
+
+# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
+
+def extract_pdf_text(pdf_file):
+    if pdf_file is None:
+        return None
+
+    try:
+        if isinstance(pdf_file, (str, Path)):
+            pdf_bytes = Path(pdf_file).read_bytes()
+        elif hasattr(pdf_file, "read"):
+            pdf_bytes = pdf_file.read()
+        else:
+            return None
+
+        images = pdf2image.convert_from_bytes(pdf_bytes)
+        all_text = [pytesseract.image_to_string(img) for img in images]
+        combined_text = "\n".join(all_text)
+        cleaned = re.sub(r'\s+', ' ', combined_text)
+        cleaned = re.sub(r'[^\w\s.,&%]', '', cleaned)
+        return cleaned.strip()
+
+    except Exception as e:
+        print(f"❌ Error during OCR: {e}")
+        return None
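
A minimal usage sketch for the OCR helper, assuming the Tesseract binary and pdf2image's poppler dependency are installed; the file path is a placeholder:

    # Hypothetical example; "sample_cv.pdf" is a placeholder path.
    text = extract_pdf_text("sample_cv.pdf")
    if text is None:
        print("OCR failed or no file was given")
    else:
        print(text[:200])  # first 200 characters of the cleaned OCR text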
requirements.txt
ADDED
@@ -0,0 +1,8 @@
+gradio
+pdfminer==20191125
+pdf2image
+pytesseract
+torch==2.1.0
+sentencepiece
+sentence-transformers==4.0.2
+transformers==4.51.2
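
Assuming the system packages behind these wheels are present (poppler for pdf2image, the Tesseract binary for pytesseract), the Python dependencies install in one step:

    pip install -r requirements.txt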
span_classifier.py
ADDED
@@ -0,0 +1,166 @@
+import torch
+import torch.nn as nn
+from sentence_transformers import SentenceTransformer
+from transformers import AutoTokenizer
+
+def load_model(model_path, head_path):
+    try:
+        model = SentenceTransformer(model_path)
+        classification_head = nn.Linear(model.get_sentence_embedding_dimension(), 5)
+        classification_head.load_state_dict(torch.load(head_path, map_location=torch.device('cpu')))
+
+        tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
+        device = torch.device('cpu')
+
+        model.to(device)
+        classification_head.to(device)
+
+        return model, classification_head, tokenizer, device
+    except Exception as e:
+        print(f"Error loading model: {e}")
+        raise
+
+def predict_spans(full_text, model, classification_head, tokenizer, device,
+                  window_size=384, stride=256, min_span_length=3):
+    class_thresholds = {
+        0: 0.8,
+        1: 0.7,
+        2: 0.75,
+        3: 0.7,
+        4: 0.8
+    }
+
+    label_map = {
+        0: 'personal_information',
+        1: 'skills',
+        2: 'education',
+        3: 'experience',
+        4: 'certification'
+    }
+
+    results = []
+    full_text = full_text.strip()
+
+    for i in range(0, len(full_text), stride):
+        window_text = full_text[i:i+window_size]
+
+        encoding = tokenizer(
+            window_text,
+            max_length=window_size,
+            padding='max_length',
+            truncation=True,
+            return_offsets_mapping=True,
+            return_tensors='pt'
+        ).to(device)
+
+        with torch.no_grad():
+            model_output = model({
+                'input_ids': encoding['input_ids'],
+                'attention_mask': encoding['attention_mask']
+            })
+            token_embeddings = model_output['token_embeddings']
+            token_logits = classification_head(token_embeddings)
+            token_probs = torch.softmax(token_logits, dim=2)
+
+        offset_mapping = encoding['offset_mapping'][0].cpu().numpy()
+        current_span = None
+
+        for token_idx, (start, end) in enumerate(offset_mapping):
+            if start == end == 0:
+                continue
+
+            probs = token_probs[0, token_idx]
+            max_prob, pred_label = torch.max(probs, dim=0)
+            max_prob = max_prob.item()
+            pred_label = pred_label.item()
+
+            if max_prob > class_thresholds[pred_label]:
+                token_text = window_text[start:end]
+
+                if token_text.startswith('##'):
+                    if current_span and current_span['label'] == label_map[pred_label]:
+                        current_span['text'] += token_text[2:]
+                        current_span['position'] = (current_span['position'][0], i+end)
+                        current_span['confidence'] = max(current_span['confidence'], max_prob)
+                    continue
+
+                if (current_span and
+                    current_span['label'] == label_map[pred_label] and
+                    (i+start - current_span['position'][1]) <= 2):
+
+                    current_span['text'] += ' ' + token_text
+                    current_span['position'] = (current_span['position'][0], i+end)
+                    current_span['confidence'] = max(current_span['confidence'], max_prob)
+                else:
+                    if current_span:
+                        results.append(current_span)
+                    current_span = {
+                        'text': token_text,
+                        'label': label_map[pred_label],
+                        'confidence': max_prob,
+                        'position': (i+start, i+end)
+                    }
+            else:
+                if current_span:
+                    results.append(current_span)
+                current_span = None
+
+        if current_span:
+            results.append(current_span)
+
+    filtered_results = []
+    for span in results:
+        clean_text = span['text'].strip()
+        if len(clean_text.split()) >= min_span_length or span['confidence'] > 0.9:
+            span['text'] = clean_text
+            filtered_results.append(span)
+
+    merged_results = []
+    filtered_results.sort(key=lambda x: x['position'][0])
+
+    for span in filtered_results:
+        if not merged_results:
+            merged_results.append(span)
+        else:
+            last = merged_results[-1]
+            if (span['label'] == last['label'] and
+                span['position'][0] <= last['position'][1] + 5):
+
+                merged_text = last['text'] + ' ' + span['text']
+                merged_results[-1] = {
+                    'text': merged_text,
+                    'label': span['label'],
+                    'confidence': max(last['confidence'], span['confidence']),
+                    'position': (last['position'][0], span['position'][1])
+                }
+            else:
+                merged_results.append(span)
+
+    for span in merged_results:
+        tokens = span['text'].split()
+        if len(tokens) > 15:
+            span['text'] = ' '.join(tokens[:15])
+
+    return merged_results
+
+def format_results(spans):
+    formatted = {}
+    for span in spans:
+        label = span['label']
+        if label not in formatted:
+            formatted[label] = []
+        formatted[label].append(span)
+
+    for label in formatted:
+        formatted[label].sort(key=lambda x: x['confidence'], reverse=True)
+
+    return formatted
+
+def format_final_output(formatted_results):
+    final_output = []
+    for label, items in formatted_results.items():
+        top_n = 1 if label == 'personal_information' else 3
+        label_upper = label.upper()
+        for item in items[:top_n]:
+            final_output.append(f"{label_upper}: {item['text']}")
+    return " ".join(final_output)
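
The span dictionaries flow from predict_spans through format_results into the flat prompt string consumed by T5. A small sketch with made-up span values (no model needed) showing that shaping:

    # Hypothetical spans, shaped like predict_spans output
    spans = [
        {'text': 'John Doe, Jakarta', 'label': 'personal_information',
         'confidence': 0.95, 'position': (0, 17)},
        {'text': 'Python, SQL, machine learning', 'label': 'skills',
         'confidence': 0.88, 'position': (40, 69)},
    ]
    grouped = format_results(spans)        # grouped by label, sorted by confidence
    prompt = format_final_output(grouped)
    print(prompt)
    # PERSONAL_INFORMATION: John Doe, Jakarta SKILLS: Python, SQL, machine learning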
text_generator.py
ADDED
@@ -0,0 +1,10 @@
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+model_source = "rfahlevih/t5-small-finetuned-resume-text-generation"
+tokenizer = T5Tokenizer.from_pretrained(model_source)
+model = T5ForConditionalGeneration.from_pretrained(model_source)
+
+def generate_text(input_text):
+    input_ids = tokenizer(input_text, return_tensors='pt', truncation=True, padding="max_length", max_length=512).input_ids
+    outputs = model.generate(input_ids, max_length=512, num_beams=4, early_stopping=True)
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
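
A minimal usage sketch; importing the module downloads the T5 checkpoint from the Hub, and the prompt below is a made-up example of the format produced by format_final_output:

    # Hypothetical prompt; real input comes from format_final_output
    prompt = "PERSONAL_INFORMATION: John Doe SKILLS: Python, SQL EXPERIENCE: Data Analyst"
    print(generate_text(prompt))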