|
|
|
from flair.data import Sentence |
|
from flair.models import SequenceTagger |
|
|
|
|
|
|
|
import json |
|
import os |
|
from dotenv import load_dotenv |
|
from pathlib import Path |
|
# Load environment variables (e.g. NER_MODEL_PATH) from a .env file in the
# current working directory; runs once at import time.
env_path = Path('.') / '.env'

load_dotenv(dotenv_path=env_path)
|
import re |
|
import string |
|
import nltk |
|
from nltk.corpus import stopwords |
|
|
|
# One-time NLTK resource downloads needed by clean_sent: 'stopwords' for
# stop-word filtering and 'punkt' for word_tokenize. No-op when cached.
nltk.download('stopwords')

nltk.download('punkt')


# Resolve the NER model location from the environment.
# NOTE(review): os.getenv returns None when NER_MODEL_PATH is unset, so
# str() would produce the literal string 'None' and the load below would
# fail with a confusing path error — confirm the .env file is present.
path = {

    'NER_MODEL_PATH': str(os.getenv('NER_MODEL_PATH')),

}

model = SequenceTagger.load(path['NER_MODEL_PATH'])
|
|
|
|
|
def inference(sent):
    """Extract shipping entities (gift-card number, tracking id, carrier) from text.

    The sentence is cleaned (punctuation/stop-words removed), tagged by the
    module-level flair ``model``, and the highest-confidence label of each
    predicted span is collected. A regex fallback recovers gift-card numbers
    the model missed, and the carrier is resolved by keyword/track-pattern
    matching.

    Args:
        sent: Raw input sentence (str).

    Returns:
        dict with keys 'GCNUM' (list[str]), 'TRACK-ID' (list[str]) and
        'Company' (str or None).
    """
    sent = clean_sent(sent)
    sentence = Sentence(sent)

    # flair's predict() annotates the Sentence in place and returns None;
    # the entity spans are read back through to_dict().
    model.predict(sentence)
    res = sentence.to_dict()

    label_name = {'GCNUM': [], 'TRACK-ID': [], 'Company': ""}
    for entity in res['entities']:
        # Keep only the highest-confidence label per span. (The original
        # sorted ascending and took index 0, i.e. the LOWEST confidence.)
        best = max(entity['labels'], key=lambda k: k['confidence'])
        value = best['value']
        # Only aggregate the list-valued keys; any other label value would
        # previously raise (KeyError for unknown labels, AttributeError for
        # 'Company', whose slot is a string filled below).
        if value in ('GCNUM', 'TRACK-ID'):
            label_name[value].append(entity['text'])

    # Regex fallback when the model found no gift-card number. The classes
    # [GOQ]/[COQ] tolerate common OCR/ASR confusions of G, C, O and Q.
    # (The original '[G,O,Q]' also matched a literal comma — a typo.)
    if not label_name['GCNUM']:
        pattern = r'[GOQ][COQ]\d{5}'
        result = re.findall(pattern, sent, re.IGNORECASE)
        if result:
            label_name['GCNUM'].append(result[0])

    # Carrier is resolved by keyword matching on the lower-cased sentence.
    label_name['Company'] = get_company_name(sent.lower())

    return label_name
|
|
|
# Carrier detection rules, compiled once at import time instead of on every
# call. Each rule pairs a company-name keyword pattern with a tracking-number
# pattern used as a fallback when no keyword is present.
_CARRIER_PATTERNS = [
    {'company': 'FeDex', 'pattern': re.compile(r"\bfedex\b"),
     'track_pattern': re.compile(r"\b[0-9]{12}\b")},
    {'company': 'UPS', 'pattern': re.compile(r"\bups\b"),
     'track_pattern': re.compile(r"\b1Z\b")},
    {'company': 'USPS', 'pattern': re.compile(r"\busps\b"),
     'track_pattern': re.compile(r"\b42033155\b")},
    {'company': 'onTrack', 'pattern': re.compile(r"\bontrack\b"),
     'track_pattern': re.compile(r"\bBG[0-9]{5}\b")},
    {'company': 'Lasership', 'pattern': re.compile(r"\b1LS\b"),
     'track_pattern': re.compile(r"\b1LS\b")},
    # Fixes two typos from the original rules: 'dmi6 | dm16' matched the
    # literal spaces around '|', and '[B,8]' also matched a literal comma.
    # [B8] tolerates the common OCR confusion of 'B' and '8'.
    {'company': 'Amazon', 'pattern': re.compile(r"\b(?:dmi6|dm16)\b"),
     'track_pattern': re.compile(r"\bT[B8]A\b")},
]


def get_company_name(sent):
    """Return the carrier mentioned in *sent*, or None if none is detected.

    Company keywords are checked first (callers pass a lower-cased sentence);
    tracking-number shapes are used only as a fallback, so an explicit
    mention always wins over an ambiguous tracking pattern.

    Args:
        sent: Sentence to scan (expected lower-cased by the caller).

    Returns:
        Display name of the carrier (str) or None.
    """
    for rule in _CARRIER_PATTERNS:
        if rule['pattern'].search(sent) is not None:
            return rule['company']

    for rule in _CARRIER_PATTERNS:
        if rule['track_pattern'].search(sent) is not None:
            return rule['company']

    return None
|
|
|
|
|
def clean_sent(sent):
    """Normalise a sentence for NER: strip punctuation and English stop-words.

    Args:
        sent: Raw input sentence (str).

    Returns:
        The cleaned sentence with surviving tokens joined by single spaces.
    """
    # Remove all ASCII punctuation in a single translate pass.
    no_punct = sent.translate(str.maketrans('', '', string.punctuation))

    # Tokenise, then drop stop-words (matched case-insensitively).
    stop_words = set(stopwords.words('english'))
    kept = [
        word
        for word in nltk.word_tokenize(no_punct)
        if word.lower() not in stop_words
    ]

    return ' '.join(kept)
|
|
|
|
|
|