"""Heuristic extraction of age, gender, interests and dislikes from free text."""

import re
from typing import Dict, List, Optional

from transformers import pipeline

# Instantiate your pipelines just once
zero_shot = pipeline("zero-shot-classification")
sentiment = pipeline("sentiment-analysis")

# Explicit age phrase: "25 years old", "25-year-old", "1 year old".
_EXPLICIT_AGE_RE = re.compile(
    r'\b(\d{1,3})\s*-?\s*years?\s*-?\s*old\b', re.IGNORECASE
)
# Fallback when no explicit phrase exists: a standalone 1-2 digit number.
_BARE_NUMBER_RE = re.compile(r'\b(\d{1,2})\b')

_GENDER_INDICATORS = {
    'male': ('he', 'him', 'his', 'brother', 'boyfriend', 'husband', 'son',
             'dad', 'father'),
    'female': ('she', 'her', 'hers', 'sister', 'girlfriend', 'wife',
               'daughter', 'mom', 'mother'),
}
_GENDER_BY_WORD = {
    word: gender
    for gender, words in _GENDER_INDICATORS.items()
    for word in words
}
# \b on both sides makes this a whole-word match that tolerates adjacent
# punctuation ("her.", "he's") without matching inside words ("the", "father").
_GENDER_WORD_RE = re.compile(r'\b(?:' + '|'.join(_GENDER_BY_WORD) + r')\b')

_INTEREST_VERBS = frozenset({"love", "loves", "like", "likes", "enjoy", "enjoys"})
_DISLIKE_VERBS = frozenset({"hate", "hates", "dislike", "dislikes"})
_NEGATORS = frozenset({"not", "never", "don't", "doesn't", "didn't"})
_AUXILIARIES = frozenset({"do", "does", "did"})
_SENTENCE_END = (".", "!", "?", ";")
_EDGE_PUNCTUATION = " \t\n.,;:!?"
# Split on commas or on a standalone "and", ignoring case.
_INTEREST_SPLIT_RE = re.compile(r'\s*,\s*|\s*\band\b\s*', re.IGNORECASE)

# The captured item stops at " and ", a comma, sentence punctuation, or the
# end of the text. Both straight and curly apostrophes are accepted.
_DISLIKE_RE = re.compile(
    r"(?:hates|dislikes|(?:doesn['\u2019]t|does\s+not)\s+like)\s+"
    r"([^,.!?;]+?)(?=\s+and\s+|[,.!?;]|$)"
)


def extract_age(text: str) -> Optional[int]:
    """Return the age mentioned in *text*, or ``None`` if none is found.

    An explicit phrase such as "25 years old" or "25-year-old" anywhere in
    the text takes priority. Only when no such phrase yields a plausible age
    is the first standalone one- or two-digit number used, so
    "has 2 cats and is 25 years old" gives 25 rather than 2.

    Only values with ``0 < age < 120`` are accepted.
    """
    for pattern in (_EXPLICIT_AGE_RE, _BARE_NUMBER_RE):
        for match in pattern.finditer(text):
            age = int(match.group(1))
            if 0 < age < 120:
                return age
    return None


def extract_gender(text: str) -> str:
    """Guess a gender from pronouns and kinship words in *text*.

    Returns ``'male'``, ``'female'`` or ``'unknown'``. Matching is
    case-insensitive and whole-word. The earliest indicator in the text
    decides, so "She loves her husband" is classified by "she", not by
    "husband".
    """
    match = _GENDER_WORD_RE.search(text.lower())
    if match is None:
        return "unknown"
    return _GENDER_BY_WORD[match.group(0)]


def _ends_chunk(words: List[str], idx: int) -> bool:
    """Return True if ``words[idx]`` begins a dislike or negation."""
    word = words[idx]
    if word in _DISLIKE_VERBS or word in _NEGATORS:
        return True
    # "does not like ...": close the chunk at the auxiliary, not at "not".
    return (
        word in _AUXILIARIES
        and idx + 1 < len(words)
        and words[idx + 1] == "not"
    )


def _interest_chunks(tokens: List[str]) -> List[List[str]]:
    """Group the tokens that follow each non-negated interest verb.

    A chunk opens after "love(s)", "like(s)" or "enjoy(s)". It closes at the
    next interest verb, at a dislike verb or negation, at a token ending in
    sentence punctuation, or at the end of the text.
    """
    # Normalise curly apostrophes so "doesn\u2019t" is recognised as a negator.
    words = [tok.lower().replace("\u2019", "'") for tok in tokens]
    chunks: List[List[str]] = []
    current: Optional[List[str]] = None  # None means no chunk is open

    for idx, word in enumerate(words):
        if word in _INTEREST_VERBS:
            if current is not None:
                chunks.append(current)
            # A verb negated within the two preceding words ("doesn't like",
            # "does not really like") expresses a dislike, so no chunk opens.
            negated = not _NEGATORS.isdisjoint(words[max(0, idx - 2):idx])
            current = None if negated else []
        elif current is not None:
            if _ends_chunk(words, idx):
                chunks.append(current)
                current = None
            else:
                current.append(tokens[idx])
                if tokens[idx].endswith(_SENTENCE_END):
                    chunks.append(current)
                    current = None

    if current is not None:
        chunks.append(current)
    return chunks


def extract_interests(text: str, categories: List[str]) -> List[Dict]:
    """Extract and classify the interests stated in *text*.

    Interests are the phrases following "love(s)", "like(s)" or "enjoy(s)".
    Each run of text after such a verb is split on commas and on a
    standalone "and". Original casing is preserved and surrounding
    punctuation is stripped. Negated verbs ("doesn't like ...") and dislike
    verbs ("hates ...") are not reported as interests, and they close the
    preceding run.

    Examples of extracted phrases:
        "She loves painting and enjoys traveling" -> "painting", "traveling"
        "She loves art and music"                 -> "art", "music"
        "She loves art and doesn't like spiders." -> "art"

    Args:
        text: Free-form description to analyse.
        categories: Candidate labels passed to the zero-shot classifier.

    Returns:
        One dict per distinct phrase, in order of first appearance, with
        keys ``phrase``, ``category``, ``confidence``, ``sentiment`` and
        ``sentiment_score``. ``category`` and ``confidence`` are the first
        label and score the zero-shot pipeline returns (presumably the
        best-scoring label; confirm against the transformers docs).
    """
    # Uses the module-level pipelines; building them per call reloads the
    # models every time.
    interests: List[Dict] = []
    seen = set()

    for chunk in _interest_chunks(text.split()):
        for piece in _INTEREST_SPLIT_RE.split(" ".join(chunk)):
            phrase = piece.strip(_EDGE_PUNCTUATION)
            if not phrase or phrase in seen:
                continue
            seen.add(phrase)

            z_result = zero_shot(phrase, categories, multi_label=False)
            s_result = sentiment(phrase)[0]
            interests.append({
                'phrase': phrase,  # preserve original casing
                'category': z_result['labels'][0],
                'confidence': z_result['scores'][0],
                'sentiment': s_result['label'],
                'sentiment_score': s_result['score'],
            })

    return interests


def extract_dislikes(text: str) -> List[str]:
    """Return the lowercased things *text* says are hated or disliked.

    Recognises "hates X", "dislikes X", "doesn't like X" and
    "does not like X". Each occurrence contributes the single item that
    follows it, ending at " and ", a comma, sentence punctuation or the end
    of the text. "She hates spiders." therefore yields ``['spiders']``.
    Items after "and" or a comma are picked up only if they have their own
    dislike verb.
    """
    candidates = (item.strip() for item in _DISLIKE_RE.findall(text.lower()))
    return [item for item in candidates if item]


def format_profile(profile: Dict) -> str:
    """Render *profile* as a multi-line, human-readable summary.

    Reads the keys ``age``, ``gender``, ``interests`` (dicts with a
    ``phrase`` key) and ``dislikes``. A falsy age is shown as "Unknown".
    The dislikes line is omitted when there are none.
    """
    lines = [
        "Profile Summary:",
        f"- Age: {profile['age'] or 'Unknown'}",
        f"- Gender: {profile['gender'].title()}",
        "- Interests: " + ", ".join(
            item['phrase'] for item in profile['interests']
        ),
    ]
    if profile['dislikes']:
        lines.append("- Dislikes: " + ", ".join(profile['dislikes']))
    return "\n".join(lines)