llm tagging & training functions done
Browse files- config.yaml +4 -0
 - data/data.jsonl +35 -0
 - few-shot-extract.py +10 -11
 - few_shot.txt +299 -0
 - llm-tagging.py +21 -92
 - train.py +113 -129
 
    	
        config.yaml
    ADDED
    
    | 
         @@ -0,0 +1,4 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            training:
         
     | 
| 2 | 
         
            +
                epochs: 3
         
     | 
| 3 | 
         
            +
                batch_size: 16
         
     | 
| 4 | 
         
            +
                learning_rate: 0.00005
         
     | 
    	
        data/data.jsonl
    ADDED
    
    | 
         @@ -0,0 +1,35 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            {"tokens": ["About", "the", "job"], "tags_knowledge": ["O", "O", "O"]}
         
     | 
| 2 | 
         
            +
            {"tokens": ["G", "##row", "with", "us"], "tags_knowledge": ["O", "O", "O", "O"]}
         
     | 
| 3 | 
         
            +
            {"tokens": ["About", "This", "Op", "##port", "##unity"], "tags_knowledge": ["O", "O", "O", "O", "O"]}
         
     | 
| 4 | 
         
            +
            {"tokens": ["Eric", "##sson", "is", "a", "world", "-", "leading", "provider", "of", "telecommunications", "equipment", "and", "services", "to", "mobile", "and", "fixed", "network", "operators", "."], "tags_knowledge": ["B", "I", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O", "B", "O", "O", "O", "O"]}
         
     | 
| 5 | 
         
            +
            {"tokens": ["Over", "1", ",", "000", "networks", "in", "more", "than", "180", "countries", "use", "Eric", "##sson", "equipment", ",", "and", "more", "than", "40", "percent", "of", "the", "world", "'", "s", "mobile", "traffic", "passes", "through", "Eric", "##sson", "networks", "."], "tags_knowledge": ["O", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "B", "I", "O"]}
         
     | 
| 6 | 
         
            +
            {"tokens": ["Using", "innovation", "to", "em", "##power", "people", ",", "business", "and", "society", ",", "Eric", "##sson", "is", "working", "towards", "the", "Network", "##ed", "Society", ":", "a", "world", "connected", "in", "real", "time", "that", "will", "open", "opportunities", "to", "create", "freedom", ",", "transform", "society", "and", "drive", "solutions", "to", "some", "of", "our", "planet", "\u2019", "s", "greatest", "challenges", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "B", "I", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
         
     | 
| 7 | 
         
            +
            {"tokens": ["Eric", "##sson", "'", "s", "6", "##G", "vision", ",", "first", "introduced", "in", "2020", ",", "remains", "pivotal", "for", "transforming", "business", "and", "society", "in", "the", "203", "##0s", "through", "secure", ",", "efficient", ",", "and", "sustainable", "communication", "services", "."], "tags_knowledge": ["B", "I", "O", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O"]}
         
     | 
| 8 | 
         
            +
            {"tokens": ["As", "6", "##G", "development", "progresses", "into", "a", "more", "concrete", "phase", "of", "regulation", "and", "standard", "##ization", "we", "are", "looking", "for", "researchers", "that", "would", "like", "to", "join", "us", ",", "co", "-", "creating", "a", "c", "##y", "##ber", "-", "physical", "world"], "tags_knowledge": ["O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
         
     | 
| 9 | 
         
            +
            {"tokens": ["Within", "Eric", "##sson", ",", "Eric", "##sson", "Research", "develops", "new", "communication", "solutions", "and", "standards", "which", "have", "made", "Eric", "##sson", "the", "industry", "leader", "in", "defining", "five", "generations", "of", "mobile", "communication", "."], "tags_knowledge": ["O", "B", "I", "O", "B", "I", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O"]}
         
     | 
| 10 | 
         
            +
            {"tokens": ["As", "we", "gear", "up", "for", "the", "6th", "generation", ",", "we", "would", "like", "to", "fully", "embrace", "and", "utilize", "cloud", "native", "principles", ",", "h", "##yper", "##sca", "##lers", "and", "internal", "cloud", "infrastructure", "in", "our", "research", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O"]}
         
     | 
| 11 | 
         
            +
            {"tokens": ["We", "are", "now", "looking", "for", "a", "M", "##L", "##O", "##ps", "research", "engineer", "to", "develop", "and", "support", "our", "work", "##flow", "##s", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "B", "I", "I", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
         
     | 
| 12 | 
         
            +
            {"tokens": ["In", "this", "role", ",", "you", "will"], "tags_knowledge": ["O", "O", "O", "O", "O", "O"]}
         
     | 
| 13 | 
         
            +
            {"tokens": ["Con", "##tri", "##but", "##e", "to", "the", "direction", "and", "implementation", "of", "M", "##L", "-", "based", "ways", "of", "working"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "O"]}
         
     | 
| 14 | 
         
            +
            {"tokens": ["Study", ",", "design", "and", "develop", "work", "##flow", "##s", "and", "solutions", "for", "AI", "based", "R", "&", "D"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O"]}
         
     | 
| 15 | 
         
            +
            {"tokens": ["Work", "across", "internal", "com", "##pute", "and", "external", "cloud", "platforms"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "B", "O"]}
         
     | 
| 16 | 
         
            +
            {"tokens": ["Working", "closely", "with", "researchers", "driving", "6", "##G", "standard", "##ization"], "tags_knowledge": ["O", "O", "O", "O", "O", "B", "I", "B", "I"]}
         
     | 
| 17 | 
         
            +
            {"tokens": ["Jo", "##in", "our", "Team"], "tags_knowledge": ["O", "O", "O", "O"]}
         
     | 
| 18 | 
         
            +
            {"tokens": ["Qualification", "##s"], "tags_knowledge": ["O", "O"]}
         
     | 
| 19 | 
         
            +
            {"tokens": ["MS", "##c", "in", "Data", "Science", "or", "related", "field", ",", "or", "have", "equivalent", "practical", "experience"], "tags_knowledge": ["B", "I", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
         
     | 
| 20 | 
         
            +
            {"tokens": ["Technical", "skills", "and", "/", "or", "professional", "experience", ",", "particularly", "in", ":"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
         
     | 
| 21 | 
         
            +
            {"tokens": ["Programming", "in", "various", "languages", "(", "Python", ",", "Go", ",", "etc", ")"], "tags_knowledge": ["O", "O", "O", "O", "O", "B", "O", "B", "O", "O", "O"]}
         
     | 
| 22 | 
         
            +
            {"tokens": ["M", "##L", "##O", "##ps", "technologies", "and", "tool", "##ing", "(", "e", ".", "g", ".", "M", "##LF", "##low", ",", "Ku", "##be", "##flow", ")"], "tags_knowledge": ["B", "I", "I", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "I", "O", "B", "I", "I", "O"]}
         
     | 
| 23 | 
         
            +
            {"tokens": ["Di", "##sp", "##atch", "##ing", "and", "computational", "Python", "packages", "(", "H", "##yd", "##ra", ",", "n", "##ump", "##y", ",", "Ten", "##sor", "##F", "##low", ",", "etc", ".", ")"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "B", "O", "O", "B", "I", "I", "O", "B", "I", "I", "O", "B", "I", "I", "I", "O", "O", "O", "O"]}
         
     | 
| 24 | 
         
            +
            {"tokens": ["Dev", "##O", "##ps", "and", "C", "##I", "/", "CD", "experience", ",", "runner", "deployment", "&", "management", ",", "pipeline", "creation", ",", "testing", "etc", ".", "for", "valid", "##ating", "M", "##L", "-", "driven", "code"], "tags_knowledge": ["B", "I", "I", "O", "B", "I", "O", "B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O"]}
         
     | 
| 25 | 
         
            +
            {"tokens": ["F", "##ami", "##lia", "##rity", "in", "the", "following", "is", "a", "plus", ":"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
         
     | 
| 26 | 
         
            +
            {"tokens": ["M", "##L", "framework", "##s", "(", "P", "##y", "##T", "##or", "##ch", ",", "Ten", "##sor", "##F", "##low", ",", "or", "Jax", ")"], "tags_knowledge": ["B", "I", "O", "O", "O", "B", "I", "I", "I", "I", "O", "B", "I", "I", "I", "O", "O", "B", "O"]}
         
     | 
| 27 | 
         
            +
            {"tokens": ["Con", "##tain", "##ers", "technologies", "(", "engines", ",", "orchestra", "##tion", "tools", "and", "framework", "##s", "such", "as", "Dock", "##er", ",", "Ka", "##nik", "##o", ",", "Ku", "##ber", "##net", "##es", ",", "He", "##lm", ",", "etc", ".", ")"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "I", "O", "O", "O", "O", "O", "O", "B", "I", "O", "B", "I", "I", "O", "B", "I", "I", "I", "O", "B", "I", "O", "O", "O", "O"]}
         
     | 
| 28 | 
         
            +
            {"tokens": ["Cloud", "ecosystems", "along", "with", "the", "respective", "infrastructure", ",", "in", "particular", "A", "##WS"], "tags_knowledge": ["B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I"]}
         
     | 
| 29 | 
         
            +
            {"tokens": ["Infrastructure", "management", "(", "An", "##sible", ",", "Terra", "##form", ",", "etc", ".", ")"], "tags_knowledge": ["O", "O", "O", "B", "I", "O", "B", "I", "O", "O", "O", "O"]}
         
     | 
| 30 | 
         
            +
            {"tokens": ["Team", "skills", "is", "a", "necessity", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O"]}
         
     | 
| 31 | 
         
            +
            {"tokens": ["Daily", "cross", "-", "functional", "collaboration", "and", "interaction", "with", "other", "skilled", "researchers", "are", "the", "basis", "for", "our", "ways", "of", "working", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
         
     | 
| 32 | 
         
            +
            {"tokens": ["You", "should", "enjoy", "working", "with", "people", "having", "diverse", "backgrounds", "and", "competence", "##s", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
         
     | 
| 33 | 
         
            +
            {"tokens": ["It", "is", "important", "that", "you", "have", "strong", "personal", "drive", "and", "a", "strong", "focus", "on", "the", "tasks", "at", "hand", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
         
     | 
| 34 | 
         
            +
            {"tokens": ["A", "##bility", "to", "translate", "high", "-", "level", "objectives", "into", "detailed", "tasks", "and", "action", "##able", "steps", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
         
     | 
| 35 | 
         
            +
            {"tokens": ["Location", ":", "Lu", "##le", "##\u00e5", ",", "Sweden"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O"]}
         
     | 
    	
        few-shot-extract.py
    CHANGED
    
    | 
         @@ -1,6 +1,7 @@ 
     | 
|
| 1 | 
         
             
            import requests
         
     | 
| 2 | 
         
             
            import os
         
     | 
| 3 | 
         
            -
            repo_dir = os. 
     | 
| 
         | 
|
| 4 | 
         | 
| 5 | 
         
             
            def show_examples(n = 10):
         
     | 
| 6 | 
         | 
| 
         @@ -10,16 +11,14 @@ def show_examples(n = 10): 
     | 
|
| 10 | 
         
             
                if response.status_code == 200:
         
     | 
| 11 | 
         | 
| 12 | 
         
             
                    data = response.json()
         
     | 
| 13 | 
         
            -
             
     | 
| 14 | 
         
            -
             
     | 
| 15 | 
         
            -
             
     | 
| 16 | 
         
            -
             
     | 
| 17 | 
         
            -
             
     | 
| 18 | 
         
            -
                         
     | 
| 19 | 
         
            -
                            file.write(f' 
     | 
| 20 | 
         
            -
                            file.write(f' 
     | 
| 21 | 
         
            -
                            file.write(f'Skill Labels: {str(skill_labels)}\n')
         
     | 
| 22 | 
         
            -
                            file.write(f'Knowledge Labels: {str(knowledge_labels)}\n')
         
     | 
| 23 | 
         
             
                            file.write('\n')
         
     | 
| 24 | 
         | 
| 25 | 
         | 
| 
         | 
|
| 1 | 
         
             
            import requests
         
     | 
| 2 | 
         
             
            import os
         
     | 
| 3 | 
         
            +
            repo_dir = os.getcwd()
         
     | 
| 4 | 
         
            +
            print(repo_dir)
         
     | 
| 5 | 
         | 
| 6 | 
         
             
            def show_examples(n = 10):
         
     | 
| 7 | 
         | 
| 
         | 
|
| 11 | 
         
             
                if response.status_code == 200:
         
     | 
| 12 | 
         | 
| 13 | 
         
             
                    data = response.json()
         
     | 
| 14 | 
         
            +
             
     | 
| 15 | 
         
            +
                    tags_knowledge = [str(a['row']['tags_knowledge']) for a in data['rows']]
         
     | 
| 16 | 
         
            +
                    tokens = [str(a['row']['tokens']) for a in data['rows']]
         
     | 
| 17 | 
         
            +
             
     | 
| 18 | 
         
            +
                    with open(f"{repo_dir}/few_shot.txt", 'w') as file:
         
     | 
| 19 | 
         
            +
                        for i in range(n):
         
     | 
| 20 | 
         
            +
                            file.write(f'tags_knowledge: {tags_knowledge[i]}\n')
         
     | 
| 21 | 
         
            +
                            file.write(f'tokens: {tokens[i]}\n')
         
     | 
| 
         | 
|
| 
         | 
|
| 22 | 
         
             
                            file.write('\n')
         
     | 
| 23 | 
         | 
| 24 | 
         | 
    	
        few_shot.txt
    ADDED
    
    | 
         @@ -0,0 +1,299 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 2 | 
         
            +
            Tokens: ['Senior', 'QA', 'Engineer', '(', 'm/f/d', ')', '<ORGANIZATION>']
         
     | 
| 3 | 
         
            +
             
     | 
| 4 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
         
     | 
| 5 | 
         
            +
            Tokens: ['<ADDRESS>', '<ADDRESS>', '<ADDRESS>', '<ADDRESS>', '<LOCATION>']
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O']
         
     | 
| 8 | 
         
            +
            Tokens: ['Date', 'posted:', '2021-07-14']
         
     | 
| 9 | 
         
            +
             
     | 
| 10 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 11 | 
         
            +
            Tokens: ['Likes:', '0', 'Dislikes:', '0', 'Love:', '0']
         
     | 
| 12 | 
         
            +
             
     | 
| 13 | 
         
            +
            Tags Knowledge: ['O', 'O']
         
     | 
| 14 | 
         
            +
            Tokens: ['Job', 'description:']
         
     | 
| 15 | 
         
            +
             
     | 
| 16 | 
         
            +
            Tags Knowledge: ['O', 'O']
         
     | 
| 17 | 
         
            +
            Tokens: ['Location', 'options:']
         
     | 
| 18 | 
         
            +
             
     | 
| 19 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
         
     | 
| 20 | 
         
            +
            Tokens: ['Remote', 'Visa', 'sponsor', 'Paid', 'relocation']
         
     | 
| 21 | 
         
            +
             
     | 
| 22 | 
         
            +
            Tags Knowledge: ['O', 'O']
         
     | 
| 23 | 
         
            +
            Tokens: ['Job', 'type:']
         
     | 
| 24 | 
         
            +
             
     | 
| 25 | 
         
            +
            Tags Knowledge: ['O']
         
     | 
| 26 | 
         
            +
            Tokens: ['Full-time']
         
     | 
| 27 | 
         
            +
             
     | 
| 28 | 
         
            +
            Tags Knowledge: ['O', 'O']
         
     | 
| 29 | 
         
            +
            Tokens: ['Experience', 'level:']
         
     | 
| 30 | 
         
            +
             
     | 
| 31 | 
         
            +
            Tags Knowledge: ['O']
         
     | 
| 32 | 
         
            +
            Tokens: ['Senior']
         
     | 
| 33 | 
         
            +
             
     | 
| 34 | 
         
            +
            Tags Knowledge: ['O']
         
     | 
| 35 | 
         
            +
            Tokens: ['Role:']
         
     | 
| 36 | 
         
            +
             
     | 
| 37 | 
         
            +
            Tags Knowledge: ['O', 'O']
         
     | 
| 38 | 
         
            +
            Tokens: ['QA/Test', 'Developer']
         
     | 
| 39 | 
         
            +
             
     | 
| 40 | 
         
            +
            Tags Knowledge: ['O']
         
     | 
| 41 | 
         
            +
            Tokens: ['Industry:']
         
     | 
| 42 | 
         
            +
             
     | 
| 43 | 
         
            +
            Tags Knowledge: ['B', 'I', 'I', 'B', 'I', 'B', 'I']
         
     | 
| 44 | 
         
            +
            Tokens: ['Business', 'to', 'Business', 'Information', 'Technology', 'Web', 'Technology']
         
     | 
| 45 | 
         
            +
             
     | 
| 46 | 
         
            +
            Tags Knowledge: ['O', 'O']
         
     | 
| 47 | 
         
            +
            Tokens: ['Company', 'size:']
         
     | 
| 48 | 
         
            +
             
     | 
| 49 | 
         
            +
            Tags Knowledge: ['O', 'O']
         
     | 
| 50 | 
         
            +
            Tokens: ['501-1k', 'people']
         
     | 
| 51 | 
         
            +
             
     | 
| 52 | 
         
            +
            Tags Knowledge: ['O', 'O']
         
     | 
| 53 | 
         
            +
            Tokens: ['Company', 'type:']
         
     | 
| 54 | 
         
            +
             
     | 
| 55 | 
         
            +
            Tags Knowledge: ['O']
         
     | 
| 56 | 
         
            +
            Tokens: ['Private']
         
     | 
| 57 | 
         
            +
             
     | 
| 58 | 
         
            +
            Tags Knowledge: ['O']
         
     | 
| 59 | 
         
            +
            Tokens: ['Technologies']
         
     | 
| 60 | 
         
            +
             
     | 
| 61 | 
         
            +
            Tags Knowledge: ['B', 'B', 'B', 'B', 'B']
         
     | 
| 62 | 
         
            +
            Tokens: ['docker', 'agile', 'selenium', 'circleci', 'jenkins']
         
     | 
| 63 | 
         
            +
             
     | 
| 64 | 
         
            +
            Tags Knowledge: ['O', 'O']
         
     | 
| 65 | 
         
            +
            Tokens: ['Job', 'description']
         
     | 
| 66 | 
         
            +
             
     | 
| 67 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 68 | 
         
            +
            Tokens: ['In', 'order', 'to', 'support', 'our', 'ongoing', 'international', 'growth', 'we', 'are', 'looking', 'for', 'a', 'Senior', 'QA', 'Engineer', 'to', 'join', 'our', 'Engineering', 'department', '.']
         
     | 
| 69 | 
         
            +
             
     | 
| 70 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 71 | 
         
            +
            Tokens: ['You', 'will', 'be', 'working', 'in', 'an', 'end-to-end', 'cross-functional', 'team', 'being', 'responsible', 'for', 'implementing', 'and', 'promoting', 'all', 'QA', 'relevant', 'topics', 'on', 'team', 'level', '.']
         
     | 
| 72 | 
         
            +
             
     | 
| 73 | 
         
            +
            Tags Knowledge: ['O']
         
     | 
| 74 | 
         
            +
            Tokens: ['Responsibilities']
         
     | 
| 75 | 
         
            +
             
     | 
| 76 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 77 | 
         
            +
            Tokens: ['Design', 'and', 'implement', 'complex', 'end-to-end', 'tests', '.']
         
     | 
| 78 | 
         
            +
             
     | 
| 79 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 80 | 
         
            +
            Tokens: ['Work', 'hands-on', 'together', 'with', 'the', 'other', 'engineers', 'within', 'the', 'Agile', 'team', '-', 'to', 'ensure', 'continuous', 'quality', 'delivery', 'of', 'automated', 'acceptance', 'API', 'and', 'performance', 'tests', '-', 'while', 'constantly', 'collaborating', 'with', 'the', 'QA', 'Engineers', 'of', 'the', 'other', 'teams', '.']
         
     | 
| 81 | 
         
            +
             
     | 
| 82 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 83 | 
         
            +
            Tokens: ['Own', 'a', 'thought-leadership', 'influence', 'regarding', 'QA', 'relevant', 'topics', 'within', 'the', 'Agile', 'team', '.']
         
     | 
| 84 | 
         
            +
             
     | 
| 85 | 
         
            +
            Tags Knowledge: ['O']
         
     | 
| 86 | 
         
            +
            Tokens: ['Requirements']
         
     | 
| 87 | 
         
            +
             
     | 
| 88 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'B', 'O', 'B', 'O', 'B', 'O', 'O', 'O', 'B', 'I', 'O', 'B', 'B', 'O', 'O']
         
     | 
| 89 | 
         
            +
            Tokens: ['At', 'least', '5', 'years', 'of', 'combined', 'experience', 'in', 'Java', 'or', 'Kotlin', 'and', 'JavaScript', 'or', 'TypeScript', 'programming', 'and', 'related', 'test', 'frameworks', '(', 'Selenium', 'TestCafe', 'etc.)', '.']
         
     | 
| 90 | 
         
            +
             
     | 
| 91 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'B', 'I', 'O', 'B', 'I', 'O']
         
     | 
| 92 | 
         
            +
            Tokens: ['Good', 'understanding', 'of', 'Agile', 'methodologies', 'and', 'Continuous', 'Delivery', '.']
         
     | 
| 93 | 
         
            +
             
     | 
| 94 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 95 | 
         
            +
            Tokens: ['Experience', 'in', 'testing', 'applications', 'on', 'every', 'level', 'of', 'the', 'testing', 'pyramid', '.']
         
     | 
| 96 | 
         
            +
             
     | 
| 97 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 98 | 
         
            +
            Tokens: ['Great', 'communicator', 'being', 'able', 'to', 'relate', 'to', 'the', 'different', 'challenges', 'that', 'developers', 'product', 'managers', 'and', 'other', 'stakeholders', 'within', 'the', 'engineering', 'department', 'face', '.']
         
     | 
| 99 | 
         
            +
             
     | 
| 100 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O']
         
     | 
| 101 | 
         
            +
            Tokens: ['Experience', 'in', 'working', 'on', 'a', 'cloud-based', 'application', 'running', 'on', 'Docker', '.']
         
     | 
| 102 | 
         
            +
             
     | 
| 103 | 
         
            +
            Tags Knowledge: ['O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 104 | 
         
            +
            Tokens: ['A', 'degree', 'in', 'Computer', 'Science', 'or', 'related', 'fields', 'or', 'equivalent', 'practical', 'experience', '.']
         
     | 
| 105 | 
         
            +
             
     | 
| 106 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O']
         
     | 
| 107 | 
         
            +
            Tokens: ['Experience', 'in', 'working', 'with', 'CircleCI', 'pipelines', 'on', 'running', 'tests', 'automatically', 'prior', 'to', 'the', 'deployment;', 'Jenkins', 'is', 'a', 'plus', '.']
         
     | 
| 108 | 
         
            +
             
     | 
| 109 | 
         
            +
            Tags Knowledge: ['B', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 110 | 
         
            +
            Tokens: ['Performance', 'and', 'security', 'testing', 'experience', 'is', 'a', 'plus', '.']
         
     | 
| 111 | 
         
            +
             
     | 
| 112 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O']
         
     | 
| 113 | 
         
            +
            Tokens: ['What', 'we', 'offer']
         
     | 
| 114 | 
         
            +
             
     | 
| 115 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 116 | 
         
            +
            Tokens: ['We', 'keep', 'things', 'open', 'agile', 'and', 'communicative', '.']
         
     | 
| 117 | 
         
            +
             
     | 
| 118 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 119 | 
         
            +
            Tokens: ['It', 'is', 'all', 'based', 'on', 'trust', 'not', 'micromanaging', '.']
         
     | 
| 120 | 
         
            +
             
     | 
| 121 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 122 | 
         
            +
            Tokens: ['The', 'whole', 'department', 'is', 'located', 'together', 'in', 'one', 'office', 'in', 'beautiful', '<LOCATION>', 'however', 'due', 'to', 'the', 'current', 'situation', 'we', 'work', 'and', 'onboard', '100%', 'remotely', 'to', 'keep', 'our', 'employees', 'safe', '.']
         
     | 
| 123 | 
         
            +
             
     | 
| 124 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 125 | 
         
            +
            Tokens: ['Our', 'team', 'members', 'are', 'self-organized', 'within', 'their', 'teams', 'working', 'on', 'independent', 'projects', 'or', 'closely', 'with', 'Product', 'Leads', 'developers', 'and', 'UX', 'designers', '.']
         
     | 
| 126 | 
         
            +
             
     | 
| 127 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 128 | 
         
            +
            Tokens: ['We', 'value', 'your', 'thoughts', 'and', 'ideas', 'and', 'will', 'give', 'you', 'the', 'freedom', 'to', 'push', 'and', 'implement', 'them!']
         
     | 
| 129 | 
         
            +
             
     | 
| 130 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 131 | 
         
            +
            Tokens: ['We', 'offer', 'competitive', 'salaries', 'and', 'support', 'personal', 'growth', 'with', 'functional', 'in-house', 'coaching', 'and', 'a', 'personal', 'development', 'budget', 'that', 'includes', 'three', 'days', 'off', 'per', 'year', '.']
         
     | 
| 132 | 
         
            +
             
     | 
| 133 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 134 | 
         
            +
            Tokens: ['You', 'will', 'gain', '–', 'and', 'share', '–', 'knowledge', 'during', 'recurring', 'learning', 'groups', 'jours', 'fixes', 'and', 'our', 'annual', 'Code', 'Camp', '.']
         
     | 
| 135 | 
         
            +
             
     | 
| 136 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 137 | 
         
            +
            Tokens: ['You', 'are', 'free', 'to', 'use', 'the', 'OS', 'of', 'your', 'choice', 'the', 'tooling', 'you', 'are', 'comfortable', 'with', 'and', 'set', 'up', 'your', 'workspace', 'the', 'way', 'you', 'like', 'it', '.']
         
     | 
| 138 | 
         
            +
             
     | 
| 139 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 140 | 
         
            +
            Tokens: ['<ORGANIZATION>', 'will', 'support', 'you', 'with', 'all', 'the', 'necessary', 'office', 'equipment', 'even', 'when', 'working', 'from', 'home!']
         
     | 
| 141 | 
         
            +
             
     | 
| 142 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 143 | 
         
            +
            Tokens: ['We', 'get', 'that', 'balancing', 'a', 'family', 'and', 'work', 'can', 'be', 'a', 'challenge', 'so', 'everyone', 'gets', 'flexible', 'working', 'hours', 'and', '30', 'days', 'of', 'holidays', 'per', 'year', '.']
         
     | 
| 144 | 
         
            +
             
     | 
| 145 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 146 | 
         
            +
            Tokens: ['Moreover', '<ORGANIZATION>', 'will', 'support', 'you', 'in', 'case', 'of', 'relocation', 'and', 'visa', 'application', '.']
         
     | 
| 147 | 
         
            +
             
     | 
| 148 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 149 | 
         
            +
            Tokens: ['Note:', 'We', 'support', 'your', 'relocation', 'but', 'due', 'to', 'tax', 'reason', 'you’d', 'be', 'required', 'to', 'be', 'resident', 'in', 'one', 'of', 'the', 'following', 'countries:', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', '.']
         
     | 
| 150 | 
         
            +
             
     | 
| 151 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 152 | 
         
            +
            Tokens: ['Visa', 'support', 'can', 'currently', 'be', 'offered', 'only', 'for', '<LOCATION>', '.']
         
     | 
| 153 | 
         
            +
             
     | 
| 154 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 155 | 
         
            +
            Tokens: ['*Do', 'I', 'need', 'to', 'meet', 'all', 'the', 'requirements', 'to', 'apply?']
         
     | 
| 156 | 
         
            +
             
     | 
| 157 | 
         
            +
            Tags Knowledge: ['O']
         
     | 
| 158 | 
         
            +
            Tokens: ['*']
         
     | 
| 159 | 
         
            +
             
     | 
| 160 | 
         
            +
            Tags Knowledge: ['O']
         
     | 
| 161 | 
         
            +
            Tokens: ['Studies']
         
     | 
| 162 | 
         
            +
             
     | 
| 163 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 164 | 
         
            +
            Tokens: ['by', 'several', 'different', 'sources', 'have', 'shown', 'that', 'on', 'average', 'men', 'will', 'apply', 'for', 'a', 'job', 'if', 'they', 'meet', '60%', 'of', 'the', 'application', 'requirements', '.']
         
     | 
| 165 | 
         
            +
             
     | 
| 166 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 167 | 
         
            +
            Tokens: ['In', 'contrast', 'women/non-binary', 'people', 'will', 'seek', 'to', 'match', 'a', 'much', 'higher', 'percentage', 'of', 'the', 'requirements', 'before', 'applying', '.']
         
     | 
| 168 | 
         
            +
             
     | 
| 169 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 170 | 
         
            +
            Tokens: ['We', 'encourage', 'everyone', 'to', 'apply', 'and', 'give', 'us', 'a', 'chance', 'to', 'evaluate', 'your', 'skills', 'and', 'experience', '.']
         
     | 
| 171 | 
         
            +
             
     | 
| 172 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 173 | 
         
            +
            Tokens: ['We', 'are', 'all', 'learning', 'on', 'the', 'job', 'and', 'although', 'the', 'listing', 'above', 'has', 'been', 'carefully', 'compiled', 'we', 'are', 'also', 'open-minded', 'and', 'interested', 'to', 'hear', 'about', 'the', 'value', 'you', 'can', 'bring', 'to', 'the', 'role', 'and', '<ORGANIZATION>', '.']
         
     | 
| 174 | 
         
            +
             
     | 
| 175 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 176 | 
         
            +
            Tokens: ['*How', 'can', 'I', 'demonstrate', 'that', 'I', 'have', 'particular', 'needs', 'in', 'the', 'application', 'process?']
         
     | 
| 177 | 
         
            +
             
     | 
| 178 | 
         
            +
            Tags Knowledge: ['O']
         
     | 
| 179 | 
         
            +
            Tokens: ['*']
         
     | 
| 180 | 
         
            +
             
     | 
| 181 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 182 | 
         
            +
            Tokens: ['For', 'people', 'living', 'with', 'disabilities', 'chronic', 'illnesses', 'or', 'neurodiversity', 'adjustments', 'and', 'support', 'can', 'make', 'a', 'decisive', 'difference', 'in', 'the', 'application', 'process', '.']
         
     | 
| 183 | 
         
            +
             
     | 
| 184 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 185 | 
         
            +
            Tokens: ['If', 'you', 'need', 'any', 'specific', 'accommodations', '(', 'tools', 'time', 'etc.', ')', 'and', 'feel', 'comfortable', 'disclosing', 'this', 'please', 'let', 'us', 'know', '.']
         
     | 
| 186 | 
         
            +
             
     | 
| 187 | 
         
            +
            Tags Knowledge: ['O', 'O']
         
     | 
| 188 | 
         
            +
            Tokens: ['Job', 'benefits:']
         
     | 
| 189 | 
         
            +
             
     | 
| 190 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O']
         
     | 
| 191 | 
         
            +
            Tokens: ['Flexible', 'working', 'hours']
         
     | 
| 192 | 
         
            +
             
     | 
| 193 | 
         
            +
            Tags Knowledge: ['O', 'O']
         
     | 
| 194 | 
         
            +
            Tokens: ['Flat', 'hierarchies']
         
     | 
| 195 | 
         
            +
             
     | 
| 196 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
         
     | 
| 197 | 
         
            +
            Tokens: ['Mentoring', '&', 'personal', 'development', 'program']
         
     | 
| 198 | 
         
            +
             
     | 
| 199 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
         
     | 
| 200 | 
         
            +
            Tokens: ['Fruits', '&', 'drinks', 'for', 'free']
         
     | 
| 201 | 
         
            +
             
     | 
| 202 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O']
         
     | 
| 203 | 
         
            +
            Tokens: ['Excellent', 'transport', 'connections']
         
     | 
| 204 | 
         
            +
             
     | 
| 205 | 
         
            +
            Tags Knowledge: ['O', 'O']
         
     | 
| 206 | 
         
            +
            Tokens: ['Sports', 'offers']
         
     | 
| 207 | 
         
            +
             
     | 
| 208 | 
         
            +
            Tags Knowledge: ['O', 'O']
         
     | 
| 209 | 
         
            +
            Tokens: ['Subsidised', 'lunches']
         
     | 
| 210 | 
         
            +
             
     | 
| 211 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O']
         
     | 
| 212 | 
         
            +
            Tokens: ['30', 'days', 'of', 'holidays']
         
     | 
| 213 | 
         
            +
             
     | 
| 214 | 
         
            +
            Tags Knowledge: ['O', 'O']
         
     | 
| 215 | 
         
            +
            Tokens: ['Child-care', 'support']
         
     | 
| 216 | 
         
            +
             
     | 
| 217 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O']
         
     | 
| 218 | 
         
            +
            Tokens: ['30', 'days', 'of', 'holiday']
         
     | 
| 219 | 
         
            +
             
     | 
| 220 | 
         
            +
            Tags Knowledge: ['O', 'O']
         
     | 
| 221 | 
         
            +
            Tokens: ['Company', 'description:']
         
     | 
| 222 | 
         
            +
             
     | 
| 223 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 224 | 
         
            +
            Tokens: ['<ORGANIZATION>', 'is', 'the', 'leading', 'SaaS-based', 'business', 'process', 'management', 'application', 'suite', 'in', 'the', 'world', '.']
         
     | 
| 225 | 
         
            +
             
     | 
| 226 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 227 | 
         
            +
            Tokens: ['<ORGANIZATION>', 'enables', 'organisations', 'to', 'keep', 'up', 'with', 'the', 'pace', 'volume', 'and', 'complexity', 'of', 'change', '.']
         
     | 
| 228 | 
         
            +
             
     | 
| 229 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 230 | 
         
            +
            Tokens: ['Our', 'Business', 'Transformation', 'Suite', 'is', 'the', 'smarter', 'way', 'to', 'continuously', 'translate', 'between', 'strategy', 'and', 'execution', '.']
         
     | 
| 231 | 
         
            +
             
     | 
| 232 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 233 | 
         
            +
            Tokens: ['With', '<ORGANIZATION>', 'companies', 'of', 'all', 'sizes', 'can', 'document', 'automate', 'and', 'analyse', 'processes', 'which', 'allows', 'them', 'to', 'make', 'smarter', 'business', 'decisions', '.']
         
     | 
| 234 | 
         
            +
             
     | 
| 235 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 236 | 
         
            +
            Tokens: ['Headquartered', 'in', '<LOCATION>', 'with', 'offices', 'in', 'the', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', 'and', '<LOCATION>', '<ORGANIZATION>', 'serves', 'more', 'than', '1,300', 'customers', 'around', 'the', 'globe', 'across', 'all', 'industries', 'and', 'employs', '300', 'employees', 'globally', '.']
         
     | 
| 237 | 
         
            +
             
     | 
| 238 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 239 | 
         
            +
            Tokens: ['Are', 'you', 'interested', 'in', 'joining', 'one', 'of', 'the', 'world’s', 'leading', 'Business', 'Process', 'Management', 'companies?', 'As', 'we', 'expand', 'our', 'presence', 'into', 'new', 'markets', 'across', 'the', 'globe', 'we', 'are', 'looking', 'to', 'add', 'to', 'our', 'team!', 'across', 'all', 'departments.']
         
     | 
| 240 | 
         
            +
             
     | 
| 241 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O']
         
     | 
| 242 | 
         
            +
            Tokens: ['Cloud', 'DevOps', 'Engineer']
         
     | 
| 243 | 
         
            +
             
     | 
| 244 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
         
     | 
| 245 | 
         
            +
            Tokens: ['<ORGANIZATION>', '<ORGANIZATION>', '<ORGANIZATION>', '<ORGANIZATION>', '<ORGANIZATION>']
         
     | 
| 246 | 
         
            +
             
     | 
| 247 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
         
     | 
| 248 | 
         
            +
            Tokens: ['<ADDRESS>', '<ADDRESS>', '<LOCATION>', '-', '<LOCATION>']
         
     | 
| 249 | 
         
            +
             
     | 
| 250 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O']
         
     | 
| 251 | 
         
            +
            Tokens: ['Date', 'posted:', '2021-01-21']
         
     | 
| 252 | 
         
            +
             
     | 
| 253 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 254 | 
         
            +
            Tokens: ['Likes:', '0', 'Dislikes:', '0', 'Love:', '0']
         
     | 
| 255 | 
         
            +
             
     | 
| 256 | 
         
            +
            Tags Knowledge: ['O', 'O']
         
     | 
| 257 | 
         
            +
            Tokens: ['Job', 'description:']
         
     | 
| 258 | 
         
            +
             
     | 
| 259 | 
         
            +
            Tags Knowledge: ['O', 'O']
         
     | 
| 260 | 
         
            +
            Tokens: ['Job', 'type:']
         
     | 
| 261 | 
         
            +
             
     | 
| 262 | 
         
            +
            Tags Knowledge: ['O']
         
     | 
| 263 | 
         
            +
            Tokens: ['Full-time']
         
     | 
| 264 | 
         
            +
             
     | 
| 265 | 
         
            +
            Tags Knowledge: ['O']
         
     | 
| 266 | 
         
            +
            Tokens: ['Role:']
         
     | 
| 267 | 
         
            +
             
     | 
| 268 | 
         
            +
            Tags Knowledge: ['O']
         
     | 
| 269 | 
         
            +
            Tokens: ['DevOps']
         
     | 
| 270 | 
         
            +
             
     | 
| 271 | 
         
            +
            Tags Knowledge: ['O']
         
     | 
| 272 | 
         
            +
            Tokens: ['Industry:']
         
     | 
| 273 | 
         
            +
             
     | 
| 274 | 
         
            +
            Tags Knowledge: ['B', 'I']
         
     | 
| 275 | 
         
            +
            Tokens: ['Financial', 'Services']
         
     | 
| 276 | 
         
            +
             
     | 
| 277 | 
         
            +
            Tags Knowledge: ['O', 'O']
         
     | 
| 278 | 
         
            +
            Tokens: ['Company', 'size:']
         
     | 
| 279 | 
         
            +
             
     | 
| 280 | 
         
            +
            Tags Knowledge: ['O', 'O']
         
     | 
| 281 | 
         
            +
            Tokens: ['10k+', 'people']
         
     | 
| 282 | 
         
            +
             
     | 
| 283 | 
         
            +
            Tags Knowledge: ['O', 'O']
         
     | 
| 284 | 
         
            +
            Tokens: ['Company', 'type:']
         
     | 
| 285 | 
         
            +
             
     | 
| 286 | 
         
            +
            Tags Knowledge: ['O']
         
     | 
| 287 | 
         
            +
            Tokens: ['Public']
         
     | 
| 288 | 
         
            +
             
     | 
| 289 | 
         
            +
            Tags Knowledge: ['O']
         
     | 
| 290 | 
         
            +
            Tokens: ['Technologies']
         
     | 
| 291 | 
         
            +
             
     | 
| 292 | 
         
            +
            Tags Knowledge: ['B', 'B', 'B']
         
     | 
| 293 | 
         
            +
            Tokens: ['cloud', 'java', 'amazon-web-services']
         
     | 
| 294 | 
         
            +
             
     | 
| 295 | 
         
            +
            Tags Knowledge: ['O', 'O']
         
     | 
| 296 | 
         
            +
            Tokens: ['Job', 'description']
         
     | 
| 297 | 
         
            +
             
     | 
| 298 | 
         
            +
            Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
         
     | 
| 299 | 
         
            +
            Tokens: ['As', 'a', 'member', 'of', 'our', 'Software', 'Engineering', 'Group', 'we', 'look', 'first', 'and', 'foremost', 'for', 'people', 'who', 'are', 'passionate', 'about', 'solving', 'business', 'problems', 'through', 'innovation', 'and', 'engineering', 'practices', '.']
         
     | 
    	
        llm-tagging.py
    CHANGED
    
    | 
         @@ -15,9 +15,9 @@ import sys 
     | 
|
| 15 | 
         
             
            from tabulate import tabulate
         
     | 
| 16 | 
         
             
            import spacy
         
     | 
| 17 | 
         
             
            import re
         
     | 
| 
         | 
|
| 18 | 
         | 
| 19 | 
         
             
            load_dotenv(".env")
         
     | 
| 20 | 
         
            -
             
     | 
| 21 | 
         
             
            nlp = spacy.load("en_core_web_sm")
         
     | 
| 22 | 
         | 
| 23 | 
         
             
            def split_text_recursively(text):
         
     | 
| 
         @@ -46,7 +46,6 @@ def tokenize_to_sent(path): 
     | 
|
| 46 | 
         
             
                for line in str_list:
         
     | 
| 47 | 
         
             
                    doc = nlp(line)
         
     | 
| 48 | 
         
             
                    for sent in doc.sents:
         
     | 
| 49 | 
         
            -
                        # print(f"{sent.text}")
         
     | 
| 50 | 
         
             
                        sents.append(sent.text)
         
     | 
| 51 | 
         | 
| 52 | 
         
             
                return sents
         
     | 
| 
         @@ -58,13 +57,15 @@ model = ChatOpenAI(temperature=0) 
     | 
|
| 58 | 
         | 
| 59 | 
         
             
            class TokenTaggingResult(BaseModel):
         
     | 
| 60 | 
         
             
                tokens: List[str]
         
     | 
| 61 | 
         
            -
                 
     | 
| 62 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 63 | 
         | 
| 64 | 
         | 
| 65 | 
         
             
            model = ChatOpenAI(model_name="gpt-4o", temperature=0.0, api_key=os.getenv('OPENAI_API_KEY'))
         
     | 
| 66 | 
         
             
            tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_skill_extraction")
         
     | 
| 67 | 
         
            -
            parser = JsonOutputParser(pydantic_object= 
     | 
| 68 | 
         | 
| 69 | 
         
             
            # Definitions
         
     | 
| 70 | 
         | 
| 
         @@ -81,23 +82,20 @@ with open('few-shot.txt', 'r') as file: 
     | 
|
| 81 | 
         
             
                few_shot_examples = file.read()
         
     | 
| 82 | 
         | 
| 83 | 
         
             
            prompt = PromptTemplate(
         
     | 
| 84 | 
         
            -
                template="""You are an expert in tagging tokens with  
     | 
| 85 | 
         
            -
                Skill definition:{skill_definition}
         
     | 
| 86 | 
         
             
                Knowledge definition:{knowledge_definition}
         
     | 
| 87 | 
         
             
                Use the examples below to tag the input text into relevant knowledge or skills categories.\n{few_shot_examples}\n{format_instructions}\n{input}\n""",
         
     | 
| 88 | 
         
             
                input_variables=["input"],
         
     | 
| 89 | 
         
             
                partial_variables={"format_instructions": parser.get_format_instructions(),
         
     | 
| 90 | 
         
             
                                   "few_shot_examples": few_shot_examples,
         
     | 
| 91 | 
         
            -
             
     | 
| 92 | 
         
             
                                   "knowledge_definition": knowledge_definition},
         
     | 
| 93 | 
         
             
            )
         
     | 
| 94 | 
         | 
| 95 | 
         
            -
            def extract_tags(text: str, tokenize = True) ->  
     | 
| 96 | 
         | 
| 97 | 
         
             
                if tokenize:
         
     | 
| 98 | 
         
            -
             
     | 
| 99 | 
         
            -
                    inputs = tokenizer(text, return_tensors="pt")
         
     | 
| 100 | 
         
            -
                    tokens =  tokenizer.decode(inputs['input_ids'].squeeze()).split()[1:-1]
         
     | 
| 101 | 
         | 
| 102 | 
         
             
                prompt_and_model = prompt | model
         
     | 
| 103 | 
         
             
                output = prompt_and_model.invoke({"input": tokens})
         
     | 
| 
         @@ -105,90 +103,21 @@ def extract_tags(text: str, tokenize = True) -> TokenTaggingResult: 
     | 
|
| 105 | 
         
             
                return tokens, output
         
     | 
| 106 | 
         | 
| 107 | 
         | 
| 108 | 
         
            -
             
     | 
| 109 | 
         
            -
             
     | 
| 110 | 
         
            -
            mapping = {0: 'B', 1: 'I', 2: 'O'}
         
     | 
| 111 | 
         
            -
            token_skill_classifier = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_skill_extraction")
         
     | 
| 112 | 
         
            -
            token_knowledge_classifier = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_knowledge_extraction")
         
     | 
| 113 | 
         
            -
             
     | 
| 114 | 
         
            -
            def convert(text):
         
     | 
| 115 | 
         
            -
                inputs = tokenizer(text, return_tensors="pt")
         
     | 
| 116 | 
         
            -
             
     | 
| 117 | 
         
            -
                with torch.no_grad():
         
     | 
| 118 | 
         
            -
                    skill_outputs = token_skill_classifier(**inputs)
         
     | 
| 119 | 
         
            -
                    knowledge_outputs = token_knowledge_classifier(**inputs)
         
     | 
| 120 | 
         
            -
             
     | 
| 121 | 
         
            -
                decoded_tokens =  tokenizer.decode(inputs['input_ids'].squeeze()).split()[1:-1]
         
     | 
| 122 | 
         
            -
                skill_cls = skill_outputs.logits.argmax(dim=2).squeeze()[1:-1]
         
     | 
| 123 | 
         
            -
                knowledge_cls = knowledge_outputs.logits.argmax(dim=2).squeeze()[1:-1]
         
     | 
| 124 | 
         
            -
             
     | 
| 125 | 
         
            -
                skill_cls = [mapping[i.item()] for i in skill_cls]
         
     | 
| 126 | 
         
            -
                knowledge_cls = [mapping[i.item()] for i in knowledge_cls]
         
     | 
| 127 | 
         
            -
             
     | 
| 128 | 
         
            -
                if len(decoded_tokens) != len(skill_cls) or len(decoded_tokens) != len(knowledge_cls):
         
     | 
| 129 | 
         
            -
                    raise ValueError("Error: Length mismatch")
         
     | 
| 130 | 
         
            -
             
     | 
| 131 | 
         
            -
                return skill_cls, knowledge_cls, decoded_tokens
         
     | 
| 132 | 
         
            -
             
     | 
| 133 | 
         
            -
             
     | 
| 134 | 
         
            -
            from transformers import pipeline
         
     | 
| 135 | 
         
            -
            pipe = pipeline("token-classification", model="jjzha/jobbert_knowledge_extraction")
         
     | 
| 136 | 
         
            -
             
     | 
| 137 | 
         
            -
            def convert2(text):
         
     | 
| 138 | 
         
            -
                output = pipe(text)
         
     | 
| 139 | 
         
            -
                tokens = [i['word'] for i in output]
         
     | 
| 140 | 
         
            -
                skill_cls = [i['entity'] for i in output]
         
     | 
| 141 | 
         
            -
                knowledge_cls = [i['entity'] for i in output]
         
     | 
| 142 | 
         
            -
             
     | 
| 143 | 
         
            -
                return skill_cls, knowledge_cls, tokens
         
     | 
| 144 | 
         
            -
                
         
     | 
| 145 | 
         
            -
             
     | 
| 146 | 
         
            -
             
     | 
| 147 | 
         
            -
             
     | 
| 148 | 
         
            -
            def tag_posting(path, llm_extract = True):
         
     | 
| 149 | 
         | 
| 150 | 
         
             
                # Reading & sentence tokenization
         
     | 
| 151 | 
         
            -
                sents = tokenize_to_sent( 
     | 
| 152 | 
         
            -
             
     | 
| 153 | 
         
            -
                for sent in sents:
         
     | 
| 154 | 
         
            -
                    # print(f"Sent: {sent}")
         
     | 
| 155 | 
         
            -
                    skill_cls, knowledge_cls, tokens = convert(sent)
         
     | 
| 156 | 
         
            -
                       
         
     | 
| 157 | 
         
            -
             
     | 
| 158 | 
         
            -
                # Pre-trained
         
     | 
| 159 | 
         
            -
                # skill_cls, knowledge_cls, _ = convert(text)
         
     | 
| 160 | 
         
            -
             
     | 
| 161 | 
         
            -
                if llm_extract:
         
     | 
| 162 | 
         
            -
             
     | 
| 163 | 
         
            -
                    # LLM-based tag extraction
         
     | 
| 164 | 
         
            -
                    tokens, output = extract_tags(text, tokenize=True)
         
     | 
| 165 | 
         
            -
                    table = zip(tokens, output['skill_labels'], output['knowledge_labels'], skill_cls, knowledge_cls)
         
     | 
| 166 | 
         
            -
                    headers = ["Token", "Skill Label", "Knowledge Label", "Pred Skill Label", "Pred Knowledge Label"]
         
     | 
| 167 | 
         
            -
                    print(tabulate(table, headers=headers, tablefmt="pretty"))
         
     | 
| 168 | 
         
            -
             
     | 
| 169 | 
         
            -
                else:
         
     | 
| 170 | 
         
            -
             
     | 
| 171 | 
         
            -
                    # Only pre-trained
         
     | 
| 172 | 
         
            -
                    table = zip(tokens, output['skill_labels'], output['knowledge_labels'])
         
     | 
| 173 | 
         
            -
                    headers = ["Token", "Skill Label", "Knowledge Label"]
         
     | 
| 174 | 
         
            -
                    print(tabulate(table, headers=headers, tablefmt="pretty"))
         
     | 
| 175 | 
         | 
| 
         | 
|
| 
         | 
|
| 176 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 177 | 
         | 
| 178 | 
         
             
            if __name__ == "__main__":
         
     | 
| 179 | 
         | 
| 180 | 
         
            -
                 
     | 
| 181 | 
         
            -
                 
     | 
| 182 | 
         
            -
             
     | 
| 183 | 
         
            -
                quit()
         
     | 
| 184 | 
         
            -
                text = input('Enter text: ')
         
     | 
| 185 | 
         
            -
             
     | 
| 186 | 
         
            -
                # LLM-based tag extraction
         
     | 
| 187 | 
         
            -
                tokens, output = extract_tags(text, tokenize=True)
         
     | 
| 188 | 
         
            -
             
     | 
| 189 | 
         
            -
                # Pre-trained
         
     | 
| 190 | 
         
            -
                skill_cls, knowledge_cls = convert(text)
         
     | 
| 191 | 
         
            -
             
     | 
| 192 | 
         
            -
                table = zip(tokens, output['skill_labels'], output['knowledge_labels'], skill_cls, knowledge_cls)
         
     | 
| 193 | 
         
            -
                headers = ["Token", "Skill Label", "Knowledge Label", "Pred Skill Label", "Pred Knowledge Label"]
         
     | 
| 194 | 
         
            -
                print(tabulate(table, headers=headers, tablefmt="pretty"))
         
     | 
| 
         | 
|
| 15 | 
         
             
            from tabulate import tabulate
         
     | 
| 16 | 
         
             
            import spacy
         
     | 
| 17 | 
         
             
            import re
         
     | 
| 18 | 
         
            +
            import json
         
     | 
| 19 | 
         | 
| 20 | 
         
             
            load_dotenv(".env")
         
     | 
| 
         | 
|
| 21 | 
         
             
            nlp = spacy.load("en_core_web_sm")
         
     | 
| 22 | 
         | 
| 23 | 
         
             
            def split_text_recursively(text):
         
     | 
| 
         | 
|
| 46 | 
         
             
                for line in str_list:
         
     | 
| 47 | 
         
             
                    doc = nlp(line)
         
     | 
| 48 | 
         
             
                    for sent in doc.sents:
         
     | 
| 
         | 
|
| 49 | 
         
             
                        sents.append(sent.text)
         
     | 
| 50 | 
         | 
| 51 | 
         
             
                return sents
         
     | 
| 
         | 
|
| 57 | 
         | 
| 58 | 
         
             
            class TokenTaggingResult(BaseModel):
         
     | 
| 59 | 
         
             
                tokens: List[str]
         
     | 
| 60 | 
         
            +
                tags_knowledge: List[str]
         
     | 
| 61 | 
         
            +
             
     | 
| 62 | 
         
            +
            class Results(BaseModel):
         
     | 
| 63 | 
         
            +
                results: List[TokenTaggingResult]
         
     | 
| 64 | 
         | 
| 65 | 
         | 
| 66 | 
         
             
            model = ChatOpenAI(model_name="gpt-4o", temperature=0.0, api_key=os.getenv('OPENAI_API_KEY'))
         
     | 
| 67 | 
         
             
            tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_skill_extraction")
         
     | 
| 68 | 
         
            +
            parser = JsonOutputParser(pydantic_object=Results)
         
     | 
| 69 | 
         | 
| 70 | 
         
             
            # Definitions
         
     | 
| 71 | 
         | 
| 
         | 
|
| 82 | 
         
             
                few_shot_examples = file.read()
         
     | 
| 83 | 
         | 
| 84 | 
         
             
            prompt = PromptTemplate(
         
     | 
| 85 | 
         
            +
                template="""You are an expert in tagging tokens with knowledge labels. Use the following definitions to tag the input tokens:
         
     | 
| 
         | 
|
| 86 | 
         
             
                Knowledge definition:{knowledge_definition}
         
     | 
| 87 | 
         
             
                Use the examples below to tag the input text into relevant knowledge or skills categories.\n{few_shot_examples}\n{format_instructions}\n{input}\n""",
         
     | 
| 88 | 
         
             
                input_variables=["input"],
         
     | 
| 89 | 
         
             
                partial_variables={"format_instructions": parser.get_format_instructions(),
         
     | 
| 90 | 
         
             
                                   "few_shot_examples": few_shot_examples,
         
     | 
| 91 | 
         
            +
                                #    "skill_definition": skill_definition,
         
     | 
| 92 | 
         
             
                                   "knowledge_definition": knowledge_definition},
         
     | 
| 93 | 
         
             
            )
         
     | 
| 94 | 
         | 
| 95 | 
         
            +
            def extract_tags(text: str, tokenize = True) -> Results:
         
     | 
| 96 | 
         | 
| 97 | 
         
             
                if tokenize:
         
     | 
| 98 | 
         
            +
                    tokens = [tokenizer.tokenize(t) for t in text]
         
     | 
| 
         | 
|
| 
         | 
|
| 99 | 
         | 
| 100 | 
         
             
                prompt_and_model = prompt | model
         
     | 
| 101 | 
         
             
                output = prompt_and_model.invoke({"input": tokens})
         
     | 
| 
         | 
|
| 103 | 
         
             
                return tokens, output
         
     | 
| 104 | 
         | 
| 105 | 
         | 
| 106 | 
         
            +
            def tag_posting(job_path, output_path):
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 107 | 
         | 
| 108 | 
         
             
                # Reading & sentence tokenization
         
     | 
| 109 | 
         
            +
                sents = tokenize_to_sent(job_path)
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 110 | 
         | 
| 111 | 
         
            +
                # LLM-based tag extraction
         
     | 
| 112 | 
         
            +
                tokens, output = extract_tags(sents, tokenize=True)
         
     | 
| 113 | 
         | 
| 114 | 
         
            +
                with open("./data/data.jsonl", "w") as file:
         
     | 
| 115 | 
         
            +
                    for entry in output['results']:
         
     | 
| 116 | 
         
            +
                        json.dump(entry, file)
         
     | 
| 117 | 
         
            +
                        file.write("\n")
         
     | 
| 118 | 
         | 
| 119 | 
         
             
            if __name__ == "__main__":
         
     | 
| 120 | 
         | 
| 121 | 
         
            +
                job_path = './job-postings/03-01-2024/1.txt'
         
     | 
| 122 | 
         
            +
                output_path = './data/data.json'
         
     | 
| 123 | 
         
            +
                tag_posting(job_path, output_path)
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
    	
        train.py
    CHANGED
    
    | 
         @@ -2,177 +2,161 @@ from transformers import AutoTokenizer, BertForTokenClassification, TrainingArgu 
     | 
|
| 2 | 
         
             
            import torch
         
     | 
| 3 | 
         
             
            from tabulate import tabulate
         
     | 
| 4 | 
         
             
            import wandb
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 5 | 
         | 
| 6 | 
         | 
| 7 | 
         
            -
             
     | 
| 8 | 
         
            -
            model = BertForTokenClassification.from_pretrained("Robzy/jobbert_knowledge_extraction")
         
     | 
| 9 | 
         
            -
             
     | 
| 10 | 
         
            -
            artifact = wandb.Artifact(name="jobbert-knowledge-extraction", type="BERT")
         
     | 
| 11 | 
         
            -
             
     | 
| 12 | 
         
            -
            text = 'Experience with Unreal and/or Unity and/or native IOS/Android 3D development and/or Web based 3D engines '
         
     | 
| 13 | 
         
            -
             
     | 
| 14 | 
         
            -
            # Tokenize  
         
     | 
| 15 | 
         
            -
            inputs = tokenizer(
         
     | 
| 16 | 
         
            -
                text, add_special_tokens=False, return_tensors="pt"
         
     | 
| 17 | 
         
            -
            )
         
     | 
| 18 | 
         
            -
             
     | 
| 19 | 
         
            -
            # Inference
         
     | 
| 20 | 
         
            -
             
     | 
| 21 | 
         
            -
            # with torch.no_grad():
         
     | 
| 22 | 
         
            -
            #     output = model(**inputs)
         
     | 
| 23 | 
         
            -
             
     | 
| 24 | 
         
            -
            # # Post-process
         
     | 
| 25 | 
         
            -
            # predicted_token_class_ids = output.logits.argmax(-1)
         
     | 
| 26 | 
         
            -
            # predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
         
     | 
| 27 | 
         
            -
            # tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze())
         
     | 
| 28 | 
         
            -
             
     | 
| 29 | 
         
            -
            # # Display
         
     | 
| 30 | 
         
            -
            # table = zip(tokens, predicted_tokens_classes)
         
     | 
| 31 | 
         
            -
            # print(tabulate(table, headers=["Token", "Predicted Class"], tablefmt="pretty"))
         
     | 
| 32 | 
         | 
| 33 | 
         
            -
             
     | 
| 34 | 
         | 
| 35 | 
         
            -
             
     | 
| 36 | 
         
            -
             
     | 
| 37 | 
         | 
| 
         | 
|
| 
         | 
|
| 38 | 
         | 
| 39 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 40 | 
         | 
| 41 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 42 | 
         | 
| 43 | 
         
            -
             
     | 
| 44 | 
         
            -
             
     | 
| 45 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 46 | 
         | 
| 47 | 
         
            -
             
     | 
| 48 | 
         | 
| 49 | 
         
            -
            from torch.utils.data import DataLoader
         
     | 
| 50 | 
         
            -
            import torch.nn as nn
         
     | 
| 51 | 
         
            -
            from transformers import DataCollatorForTokenClassification
         
     | 
| 52 | 
         
            -
            from typing import List, Tuple
         
     | 
| 
         | 
|
| 53 | 
         | 
| 54 | 
         
            -
             
     | 
| 55 | 
         
            -
                 
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 56 | 
         | 
| 57 | 
         
            -
                 
     | 
| 58 | 
         
            -
                padded_lists = [lst + [pad_value] * (max_len - len(lst)) for lst in list_of_lists]
         
     | 
| 59 | 
         
            -
                attention_masks = [[1] * len(lst) + [0] * (max_len - len(lst)) for lst in list_of_lists]
         
     | 
| 60 | 
         
            -
                
         
     | 
| 61 | 
         
            -
                return torch.tensor(padded_lists), torch.tensor(attention_masks)
         
     | 
| 62 | 
         | 
| 
         | 
|
| 63 | 
         | 
| 64 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 65 | 
         | 
| 66 | 
         
            -
                input_ids, attention_mask = pad(list(map(lambda x: tokenizer.convert_tokens_to_ids(x['tokens']),batch)))
         
     | 
| 67 | 
         
            -
                tags_knowledge, _ = pad([list(map(lambda x: label2id[x],o)) for o in [b['tags_knowledge'] for b in batch]])
         
     | 
| 68 | 
         
            -
                return {"input_ids": input_ids, "tags_knowledge": tags_knowledge, "attention_mask": attention_mask}
         
     | 
| 69 | 
         | 
| 70 | 
         
            -
             
     | 
| 71 | 
         
            -
            batch_size = 32
         
     | 
| 72 | 
         
            -
            train_dataloader = DataLoader(dataset['train'], shuffle=True, batch_size=batch_size, collate_fn=collate_fn)
         
     | 
| 73 | 
         
            -
            eval_dataloader = DataLoader(dataset['train'], batch_size=batch_size, collate_fn=collate_fn)
         
     | 
| 74 | 
         | 
| 75 | 
         
            -
             
     | 
| 76 | 
         
            -
             
     | 
| 77 | 
         
            -
             
     | 
| 78 | 
         | 
| 79 | 
         
            -
            model.train()
         
     | 
| 80 | 
         
            -
            device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
         
     | 
| 81 | 
         | 
| 82 | 
         
            -
             
     | 
| 83 | 
         
            -
             
     | 
| 84 | 
         
            -
            id2label = model.config.id2label
         
     | 
| 85 | 
         
            -
            label2id = model.config.label2id
         
     | 
| 86 | 
         | 
| 87 | 
         
            -
             
     | 
| 88 | 
         
            -
             
     | 
| 
         | 
|
| 89 | 
         | 
| 90 | 
         
            -
             
     | 
| 91 | 
         
            -
             
     | 
| 92 | 
         
            -
            lr_scheduler = get_scheduler(
         
     | 
| 93 | 
         
            -
                name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
         
     | 
| 94 | 
         
            -
            )
         
     | 
| 95 | 
         | 
| 96 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 97 | 
         | 
| 98 | 
         
            -
             
     | 
| 99 | 
         | 
| 100 | 
         
            -
             
     | 
| 101 | 
         
            -
             
     | 
| 102 | 
         
            -
             
     | 
| 
         | 
|
| 103 | 
         | 
| 104 | 
         
            -
             
     | 
| 105 | 
         
            -
            current_time = datetime.now()
         
     | 
| 106 | 
         | 
| 107 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 108 | 
         | 
| 109 | 
         
            -
             
     | 
| 110 | 
         
            -
                 
     | 
| 111 | 
         
            -
             
     | 
| 
         | 
|
| 112 | 
         | 
| 113 | 
         
            -
             
     | 
| 114 | 
         
            -
                config={
         
     | 
| 115 | 
         
            -
                "learning_rate": lr,
         
     | 
| 116 | 
         
            -
                "architecture": "BERT",
         
     | 
| 117 | 
         
            -
                "epochs": num_epochs,
         
     | 
| 118 | 
         
            -
                "batch_size": batch_size,
         
     | 
| 119 | 
         
            -
                "notes": "Datetime: " + current_time.strftime("%m/%d/%Y, %H:%M:%S")
         
     | 
| 120 | 
         
            -
                }
         
     | 
| 121 | 
         
            -
            )
         
     | 
| 122 | 
         | 
| 123 | 
         
            -
             
     | 
| 124 | 
         
            -
            from datetime import datetime
         
     | 
| 125 | 
         
            -
            logging.info("Initiating training")
         
     | 
| 126 | 
         | 
| 127 | 
         
            -
             
     | 
| 128 | 
         
            -
             
     | 
| 129 | 
         
            -
                logging.info(f"Epoch #{epoch}")
         
     | 
| 130 | 
         
            -
                print(f"Epoch #{epoch}")
         
     | 
| 131 | 
         | 
| 132 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 133 | 
         | 
| 134 | 
         
            -
             
     | 
| 135 | 
         | 
| 136 | 
         
            -
             
     | 
| 137 | 
         
            -
             
     | 
| 
         | 
|
| 138 | 
         | 
| 139 | 
         
            -
             
     | 
| 140 | 
         
            -
             
     | 
| 141 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 142 | 
         | 
| 143 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 144 | 
         | 
| 145 | 
         
            -
             
     | 
| 146 | 
         
            -
                    pred = outputs.logits.reshape(-1, model.config.num_labels) # Logits
         
     | 
| 147 | 
         
            -
                    label = torch.where(attention_mask==0, torch.tensor(IGNORE_INDEX).to(device), tags_knowledge).reshape(-1) # Labels, padding set to class idx -100
         
     | 
| 148 | 
         | 
| 149 | 
         
            -
                     
     | 
| 150 | 
         
            -
                    _, predicted_labels = torch.max(pred, dim=1)
         
     | 
| 151 | 
         
            -
                    non_pad_elements = label != IGNORE_INDEX
         
     | 
| 152 | 
         
            -
                    correct_predictions = (predicted_labels[non_pad_elements] == label[non_pad_elements]).sum().item()
         
     | 
| 153 | 
         
            -
                    total_predictions = non_pad_elements.sum().item()
         
     | 
| 154 | 
         
            -
                    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
         
     | 
| 155 | 
         | 
| 156 | 
         
            -
             
     | 
| 157 | 
         
            -
                    loss.backward()
         
     | 
| 158 | 
         
            -
                    optimizer.step()
         
     | 
| 159 | 
         
            -
                    lr_scheduler.step()
         
     | 
| 160 | 
         
            -
                    optimizer.zero_grad()
         
     | 
| 161 | 
         
            -
                    
         
     | 
| 162 | 
         
            -
                    wandb.log({"epoch": epoch, "accuracy": accuracy, "loss": loss})
         
     | 
| 163 | 
         | 
| 164 | 
         
            -
                    batch_count += 1
         
     | 
| 165 | 
         | 
| 166 | 
         
            -
                 
     | 
| 167 | 
         | 
| 168 | 
         | 
| 169 | 
         
            -
             
     | 
| 
         | 
|
| 170 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 171 | 
         | 
| 172 | 
         
            -
            #  
     | 
| 173 | 
         
            -
             
     | 
| 174 | 
         
            -
            with artifact.new_file('model.pth', mode='wb') as f:
         
     | 
| 175 | 
         
            -
                torch.save(state_dict, f)
         
     | 
| 176 | 
         | 
| 177 | 
         
            -
             
     | 
| 178 | 
         
            -
             
     | 
| 
         | 
| 
         | 
|
| 2 | 
         
             
            import torch
         
     | 
| 3 | 
         
             
            from tabulate import tabulate
         
     | 
| 4 | 
         
             
            import wandb
         
     | 
| 5 | 
         
            +
            import os
         
     | 
| 6 | 
         
            +
            import yaml
         
     | 
| 7 | 
         
            +
            from datetime import datetime
         
     | 
| 8 | 
         | 
| 9 | 
         | 
| 10 | 
         
            +
            def train(json_path: str):
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 11 | 
         | 
| 12 | 
         
            +
                ### Model & tokenizer loading
         
     | 
| 13 | 
         | 
| 14 | 
         
            +
                tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_knowledge_extraction")
         
     | 
| 15 | 
         
            +
                model = BertForTokenClassification.from_pretrained("Robzy/jobbert_knowledge_extraction")
         
     | 
| 16 | 
         | 
| 17 | 
         
            +
                with open("./config.yaml", "r") as file:
         
     | 
| 18 | 
         
            +
                    config = yaml.safe_load(file)
         
     | 
| 19 | 
         | 
| 20 | 
         
            +
                num_epochs = config['training']['epochs']
         
     | 
| 21 | 
         
            +
                batch_size = config['training']['batch_size']
         
     | 
| 22 | 
         
            +
                lr = config['training']['learning_rate']
         
     | 
| 23 | 
         
            +
                current_time = datetime.now()
         
     | 
| 24 | 
         | 
| 25 | 
         
            +
                run = wandb.init(
         
     | 
| 26 | 
         
            +
                    # set the wandb project where this run will be logged
         
     | 
| 27 | 
         
            +
                    project="in-demand",
         
     | 
| 28 | 
         | 
| 29 | 
         
            +
                    # track hyperparameters and run metadata
         
     | 
| 30 | 
         
            +
                    config={
         
     | 
| 31 | 
         
            +
                    "learning_rate": lr,
         
     | 
| 32 | 
         
            +
                    "architecture": "BERT",
         
     | 
| 33 | 
         
            +
                    "epochs": num_epochs,
         
     | 
| 34 | 
         
            +
                    "batch_size": batch_size,
         
     | 
| 35 | 
         
            +
                    "notes": "Datetime: " + current_time.strftime("%m/%d/%Y, %H:%M:%S")
         
     | 
| 36 | 
         
            +
                    }
         
     | 
| 37 | 
         
            +
                )
         
     | 
| 38 | 
         | 
| 39 | 
         
            +
                ### Data loading and preprocessing
         
     | 
| 40 | 
         | 
| 41 | 
         
            +
                from torch.utils.data import DataLoader
         
     | 
| 42 | 
         
            +
                import torch.nn as nn
         
     | 
| 43 | 
         
            +
                from transformers import DataCollatorForTokenClassification
         
     | 
| 44 | 
         
            +
                from typing import List, Tuple
         
     | 
| 45 | 
         
            +
                from datasets import load_dataset
         
     | 
| 46 | 
         | 
| 47 | 
         
            +
                # dataset = load_dataset("json", data_files="data/test-short.json")
         
     | 
| 48 | 
         
            +
                dataset = load_dataset("json", data_files=json_path)
         
     | 
| 49 | 
         
            +
                dataset = dataset.map(
         
     | 
| 50 | 
         
            +
                    lambda x: {"input_ids": torch.tensor(tokenizer.convert_tokens_to_ids(x["tokens"]))}
         
     | 
| 51 | 
         
            +
                )
         
     | 
| 52 | 
         | 
| 53 | 
         
            +
                def pad(list_of_lists, pad_value=0):
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 54 | 
         | 
| 55 | 
         
            +
                    max_len = max(len(lst) for lst in list_of_lists)
         
     | 
| 56 | 
         | 
| 57 | 
         
            +
                    # Pad shorter lists with the specified value
         
     | 
| 58 | 
         
            +
                    padded_lists = [lst + [pad_value] * (max_len - len(lst)) for lst in list_of_lists]
         
     | 
| 59 | 
         
            +
                    attention_masks = [[1] * len(lst) + [0] * (max_len - len(lst)) for lst in list_of_lists]
         
     | 
| 60 | 
         
            +
                    
         
     | 
| 61 | 
         
            +
                    return torch.tensor(padded_lists), torch.tensor(attention_masks)
         
     | 
| 62 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 63 | 
         | 
| 64 | 
         
            +
                def collate_fn(batch: List[List[torch.Tensor]]):
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 65 | 
         | 
| 66 | 
         
            +
                    input_ids, attention_mask = pad(list(map(lambda x: tokenizer.convert_tokens_to_ids(x['tokens']),batch)))
         
     | 
| 67 | 
         
            +
                    tags_knowledge, _ = pad([list(map(lambda x: label2id[x],o)) for o in [b['tags_knowledge'] for b in batch]])
         
     | 
| 68 | 
         
            +
                    return {"input_ids": input_ids, "tags_knowledge": tags_knowledge, "attention_mask": attention_mask}
         
     | 
| 69 | 
         | 
| 
         | 
|
| 
         | 
|
| 70 | 
         | 
| 71 | 
         
            +
                ###  Training settings
         
     | 
| 72 | 
         
            +
                train_dataloader = DataLoader(dataset['train'], batch_size=batch_size, collate_fn=collate_fn)
         
     | 
| 
         | 
|
| 
         | 
|
| 73 | 
         | 
| 74 | 
         
            +
                from tqdm.auto import tqdm
         
     | 
| 75 | 
         
            +
                from torch.optim import AdamW
         
     | 
| 76 | 
         
            +
                from transformers import get_scheduler
         
     | 
| 77 | 
         | 
| 78 | 
         
            +
                model.train()
         
     | 
| 79 | 
         
            +
                device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 80 | 
         | 
| 81 | 
         
            +
                IGNORE_INDEX = -100
         
     | 
| 82 | 
         
            +
                criterion = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
         
     | 
| 83 | 
         
            +
                id2label = model.config.id2label
         
     | 
| 84 | 
         
            +
                label2id = model.config.label2id
         
     | 
| 85 | 
         | 
| 86 | 
         
            +
                optimizer = AdamW(model.parameters(), lr=lr)
         
     | 
| 87 | 
         | 
| 88 | 
         
            +
                num_training_steps = num_epochs * len(train_dataloader)
         
     | 
| 89 | 
         
            +
                lr_scheduler = get_scheduler(
         
     | 
| 90 | 
         
            +
                    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
         
     | 
| 91 | 
         
            +
                )
         
     | 
| 92 | 
         | 
| 93 | 
         
            +
    ### Training

    # Load environment variables (e.g. credentials) before training starts.
    from dotenv import load_dotenv
    import os
    load_dotenv(".env")
    import logging
    logging.info("Initiating training")

    # Progress bar tracks epochs; batches are logged individually below.
    progress_bar = tqdm(range(num_epochs), desc="Epochs")
    for epoch in range(num_epochs):
        logging.info(f"Epoch #{epoch}")
        # print(f"Epoch #{epoch}")

        batch_count = 1

        for batch in train_dataloader:

            logging.info(f"Batch #{batch_count} / {len(train_dataloader)}")
            # print(f"Batch #{batch_count} / {len(train_dataloader)}")

            # Move the collated batch onto the training device.
            tokens = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            tags_knowledge = batch['tags_knowledge'].to(device)

            outputs = model(tokens, attention_mask=attention_mask)

            # Batch
            # Flatten to (batch * seq_len, num_labels) for token-level CE loss.
            pred = outputs.logits.reshape(-1, model.config.num_labels) # Logits
            label = torch.where(attention_mask==0, torch.tensor(IGNORE_INDEX).to(device), tags_knowledge).reshape(-1) # Labels, padding set to class idx -100

            # Compute accuracy ignoring padding idx
            _, predicted_labels = torch.max(pred, dim=1)
            non_pad_elements = label != IGNORE_INDEX
            correct_predictions = (predicted_labels[non_pad_elements] == label[non_pad_elements]).sum().item()
            total_predictions = non_pad_elements.sum().item()
            accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0

            # Standard step: loss, backprop, optimizer + LR scheduler, reset grads.
            loss = criterion(pred, label)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            
            # NOTE(review): logs the loss tensor itself; wandb accepts it, but
            # loss.item() would avoid holding the graph reference — confirm.
            wandb.log({"epoch": epoch, "accuracy": accuracy, "loss": loss})

            batch_count += 1

        progress_bar.update(1)

    print("Training complete")
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 143 | 
         | 
| 
         | 
|
| 144 | 
         | 
| 145 | 
         
            +
    ### Pushing model

    # Hugging Face
    # Uploads the fine-tuned weights/config to the named Hub repository.
    model.push_to_hub("Robzy/jobbert_knowledge_extraction")

    # W&B
    # Also archive the raw state dict as a W&B artifact for versioning.
    artifact = wandb.Artifact(name="jobbert-knowledge-extraction", type="BERT")
    state_dict = model.state_dict()
    with artifact.new_file('model.pth', mode='wb') as f:
        torch.save(state_dict, f)

    # Log the artifact to W&B
    wandb.log_artifact(artifact)
         
     | 
| 
         | 
|
| 
         | 
|
| 159 | 
         | 
| 160 | 
         
            +
if __name__ == "__main__":
    
    # Script entry point: train on the bundled JSONL token/tag dataset.
    train(json_path="./data/data.jsonl")
         
     |