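"""Tag job-posting text with skill and knowledge labels in two ways and compare the results:

1. LLM-based extraction: GPT-4o prompted via LangChain with definitions and few-shot examples,
   returning JSON label lists that parallel the input tokens.
2. Fine-tuned token classification: the jjzha/jobbert_skill_extraction and
   jjzha/jobbert_knowledge_extraction checkpoints from Hugging Face, producing BIO tags.

The __main__ block prints both sets of labels side by side for a user-supplied sentence.
"""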
import os
from typing import List

import torch
from dotenv import load_dotenv
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from tabulate import tabulate
from transformers import AutoTokenizer, AutoModelForTokenClassification

load_dotenv(".env")
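# The .env file is expected to provide OPENAI_API_KEY, which is read below when the
# ChatOpenAI client is constructed.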
### LLM-based tag extraction with few-shot learning

# Structured output schema: parallel lists of tokens and their skill/knowledge labels.
class TokenTaggingResult(BaseModel):
    tokens: List[str]
    skill_labels: List[str]
    knowledge_labels: List[str]

model = ChatOpenAI(model_name="gpt-4o", temperature=0.0, api_key=os.getenv('OPENAI_API_KEY'))
tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_skill_extraction")
parser = JsonOutputParser(pydantic_object=TokenTaggingResult)
# Definitions
skill_definition = """
Skill means the ability to apply knowledge and use know-how to complete tasks and solve problems.
"""
knowledge_definition = """
Knowledge means the outcome of the assimilation of information through learning. Knowledge is the body of facts, principles, theories and practices that is related to a field of work or study.
"""

# Few-shot examples
with open('few-shot.txt', 'r') as file:
    few_shot_examples = file.read()
prompt = PromptTemplate(
    template="""You are an expert in tagging tokens with skill and knowledge labels. Use the following definitions to tag the input tokens:
Skill definition:{skill_definition}
Knowledge definition:{knowledge_definition}
Use the examples below to tag the input text into relevant knowledge or skills categories.\n{few_shot_examples}\n{format_instructions}\n{input}\n""",
    input_variables=["input"],
    partial_variables={"format_instructions": parser.get_format_instructions(),
                       "few_shot_examples": few_shot_examples,
                       "skill_definition": skill_definition,
                       "knowledge_definition": knowledge_definition},
)
def extract_tags(text: str, tokenize: bool = True) -> tuple:
    """Tag `text` with skill/knowledge labels using the few-shot LLM chain."""
    if tokenize:
        # Tokenize with the JobBERT tokenizer, then drop the [CLS] and [SEP] markers.
        inputs = tokenizer(text, return_tensors="pt")
        tokens = tokenizer.decode(inputs['input_ids'].squeeze()).split()[1:-1]
    else:
        # Assume plain whitespace tokenization when JobBERT tokenization is disabled.
        tokens = text.split()
    prompt_and_model = prompt | model
    output = prompt_and_model.invoke({"input": tokens})
    output = parser.invoke(output)
    return tokens, output
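# Example usage (illustrative; requires OPENAI_API_KEY and network access):
#   tokens, result = extract_tags("Proficient in Python and project management")
#   # `result` is a dict with "skill_labels" and "knowledge_labels" aligned to `tokens`.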
### Pre-trained model from Hugging Face

# Map predicted class indices to BIO tags.
mapping = {0: 'B', 1: 'I', 2: 'O'}
token_skill_classifier = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_skill_extraction")
token_knowledge_classifier = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_knowledge_extraction")
def convert(text):
    """Run both JobBERT token classifiers on `text` and return their BIO label sequences."""
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        skill_outputs = token_skill_classifier(**inputs)
        knowledge_outputs = token_knowledge_classifier(**inputs)
    decoded_tokens = tokenizer.decode(inputs['input_ids'].squeeze()).split()[1:-1]
    # Take the argmax class per position and drop the [CLS]/[SEP] positions.
    skill_cls = skill_outputs.logits.argmax(dim=2).squeeze()[1:-1]
    knowledge_cls = knowledge_outputs.logits.argmax(dim=2).squeeze()[1:-1]
    skill_cls = [mapping[i.item()] for i in skill_cls]
    knowledge_cls = [mapping[i.item()] for i in knowledge_cls]
    return skill_cls, knowledge_cls
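# Example usage (illustrative):
#   skill_tags, knowledge_tags = convert("Strong communication skills and knowledge of SQL")
# Note: labels are produced per wordpiece (special tokens excluded), so their length can
# exceed the whitespace-level token list when a word is split into several subwords.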
if __name__ == "__main__":
    text = input('Enter text: ')

    # LLM-based tag extraction
    tokens, output = extract_tags(text, tokenize=True)

    # Pre-trained token classifiers
    skill_cls, knowledge_cls = convert(text)

    # Print both sets of labels side by side.
    table = zip(tokens, output['skill_labels'], output['knowledge_labels'], skill_cls, knowledge_cls)
    headers = ["Token", "LLM Skill Label", "LLM Knowledge Label", "JobBERT Skill Label", "JobBERT Knowledge Label"]
    print(tabulate(table, headers=headers, tablefmt="pretty"))