# Persona post-processing script: enriches stored LinkedIn posts with
# LLM-extracted metadata (line count, language, tags) and unifies tags.
| import json | |
| from llm_helper import llm | |
| from langchain_core.prompts import PromptTemplate | |
| from langchain_core.output_parsers import JsonOutputParser | |
| from langchain_core.exceptions import OutputParserException | |
| import sqlite3 | |
# NOTE: the legacy file-based pipeline (process_posts reading raw JSON from
# disk) was removed; it is superseded by the SQLite-backed
# process_posts_for_persona below.
def process_posts_for_persona(persona_name, processed_file_path=None):
    """Enrich every stored post of *persona_name* with LLM metadata.

    Fetches the persona's posts from the local SQLite database, asks the
    LLM for per-post metadata (line count, language, tags), unifies the
    tags across all posts, and optionally writes the result to disk.

    Args:
        persona_name: Name of the persona whose posts should be processed.
        processed_file_path: Optional path; when given, the enriched posts
            are written there as indented JSON.

    Returns:
        A list of dicts ({"text": ..., plus metadata keys}), or None when
        the persona has no posts.
    """
    posts = get_posts_by_persona(persona_name)
    if not posts:
        print(f"No posts found for persona '{persona_name}'.")
        return

    enriched_posts = []
    for post in posts:
        metadata = extract_metadata(post)
        # Combine the raw text with the LLM-extracted metadata.
        enriched_posts.append({"text": post} | metadata)

    # BUGFIX: this unification step previously sat at module level (after
    # get_posts_by_persona), where `enriched_posts` and
    # `processed_file_path` were undefined and the script raised NameError
    # on import. It belongs here, before the results are written out.
    unified_tags = get_unified_tags(enriched_posts)
    for post in enriched_posts:
        # Map each original tag to its unified replacement; fall back to
        # the original tag if the LLM mapping missed one. The set removes
        # duplicates created by the merge.
        post['tags'] = list({unified_tags.get(tag, tag) for tag in post['tags']})

    if processed_file_path:
        # Single write of the final (unified) data; the original wrote the
        # file twice — once before and once after unification.
        with open(processed_file_path, "w", encoding="utf-8") as outfile:
            json.dump(enriched_posts, outfile, indent=4)
    return enriched_posts


def get_posts_by_persona(persona_name):
    """Return the text of every post stored for *persona_name*.

    Looks the persona up in personas.db; returns [] (after printing a
    notice) when the persona does not exist.
    """
    conn = sqlite3.connect("personas.db")
    try:
        cursor = conn.cursor()
        # Parameterized queries guard against SQL injection.
        cursor.execute(
            "SELECT persona_id FROM personas WHERE name = ?", (persona_name,)
        )
        persona = cursor.fetchone()
        if not persona:
            print(f"Persona '{persona_name}' not found.")
            return []
        cursor.execute(
            "SELECT text_blocks FROM posts WHERE persona_id = ?", (persona[0],)
        )
        return [row[0] for row in cursor.fetchall()]
    finally:
        # BUGFIX: the original leaked the connection on the early
        # "persona not found" return; finally closes it on every path.
        conn.close()
def extract_metadata(post):
    """Ask the LLM to describe a single post.

    Args:
        post: The raw post text to analyse.

    Returns:
        A dict with the keys line_count, language and tags (tags is a list
        of at most two strings), parsed from the model's JSON reply.

    Raises:
        OutputParserException: when the model reply is not valid JSON.
    """
    template = '''
You are given a LinkedIn post. You need to extract number of lines, language of the post and tags.
1. Return a valid JSON. No preamble.
2. JSON object should have exactly three keys: line_count, language and tags.
3. tags is an array of text tags. Extract maximum two tags.
4. Language should be English, Kannada and Hindi
Here is the actual post on which you need to perform this task:
{post}
'''
    pt = PromptTemplate.from_template(template)
    chain = pt | llm
    response = chain.invoke(input={"post": post})

    # Parser construction cannot raise; keep only the parse call in `try`.
    json_parser = JsonOutputParser()
    try:
        res = json_parser.parse(response.content)
    except OutputParserException as exc:
        # BUGFIX: the message previously said "Unable to parse jobs."
        # (copied from an unrelated project); chain the original exception
        # so the root cause is preserved.
        raise OutputParserException(
            "Unable to parse post metadata from the LLM response."
        ) from exc
    return res
def get_unified_tags(posts_with_metadata):
    """Merge near-duplicate tags across posts into one canonical set.

    Args:
        posts_with_metadata: Iterable of dicts that each carry a 'tags'
            list (as produced by extract_metadata).

    Returns:
        A dict mapping every original tag to its unified, title-case tag.

    Raises:
        OutputParserException: when the model reply is not valid JSON.
    """
    unique_tags = set()
    # Collect every distinct tag seen across all posts.
    for post in posts_with_metadata:
        unique_tags.update(post['tags'])

    # Prompt fixes vs. the original: the last two rules were both numbered
    # "3.", and the example JSON was missing a closing quote
    # ("Motivation}}), which could mislead the model into emitting
    # malformed JSON.
    template = '''I will give you a list of tags. You need to unify tags with the following requirements,
1. Tags are unified and merged to create a shorter list.
Example 1: "Jobseekers", "Job Hunting" can be all merged into a single tag "Job Search".
Example 2: "Motivation", "Inspiration", "Drive" can be mapped to "Motivation"
Example 3: "Personal Growth", "Personal Development", "Self Improvement" can be mapped to "Self Improvement"
Example 4: "Scam Alert", "Job Scam" etc. can be mapped to "Scams"
Example 5: "Finance", "economics", "currency" etc., can be mapped to "Financial literacy"
2. Each tag should be follow title case convention. example: "Motivation", "Job Search"
3. Output should be a JSON object, No preamble
4. Output should have mapping of original tag and the unified tag.
For example: {{"Jobseekers": "Job Search", "Job Hunting": "Job Search", "Motivation": "Motivation"}}
Here is the list of tags:
{tags}
'''
    pt = PromptTemplate.from_template(template)
    chain = pt | llm
    # ','.join already yields a str; the original wrapped it in a
    # redundant str() call.
    response = chain.invoke(input={"tags": ','.join(unique_tags)})

    json_parser = JsonOutputParser()
    try:
        res = json_parser.parse(response.content)
    except OutputParserException as exc:
        # BUGFIX: message previously said "Unable to parse jobs."; chain
        # the original exception so the root cause is preserved.
        raise OutputParserException(
            "Unable to parse unified tags from the LLM response."
        ) from exc
    return res
if __name__ == "__main__":
    # Interactive entry point: process one persona's posts and save the
    # enriched output under data/. Strip the input so stray whitespace
    # does not leak into the output filename; skip processing entirely
    # when nothing was entered.
    persona = input("Enter the persona name: ").strip()
    if persona:
        process_posts_for_persona(
            persona, f"data/processed_{persona.lower()}_posts.json"
        )
    else:
        print("No persona name given.")