Spaces:
Paused
Paused
| import re | |
| from typing import Dict, List, Optional | |
| from transformers import pipeline | |
# Build the shared Hugging Face pipelines once, at import time, so the
# functions in this module can reuse them.
# No model name is passed, so transformers picks its default model for each task.
zero_shot = pipeline("zero-shot-classification")
sentiment = pipeline("sentiment-analysis")
def extract_age(text: str) -> Optional[int]:
    """Return the most plausible age mentioned in *text*, or None.

    An explicit "N years old" / "N-year-old" mention always wins over a bare
    number, wherever it appears in the text. If no explicit mention is
    usable, the first bare one- or two-digit number is used instead.

    Args:
        text: Free-form description to search.

    Returns:
        The first candidate with 0 < age < 120, or None if there is none.
    """
    patterns = (
        # Explicit form; up to three digits so ages 100-119 can be reported.
        r"\b(\d{1,3})\s*-?\s*years?\s*-?\s*old\b",
        # Fallback: a bare number. Kept to two digits so street numbers,
        # prices and similar three-digit values are not mistaken for ages.
        r"\b(\d{1,2})\b",
    )
    for pattern in patterns:
        for digits in re.findall(pattern, text, flags=re.IGNORECASE):
            age = int(digits)
            # Skip implausible values and keep looking rather than giving up.
            if 0 < age < 120:
                return age
    return None
def extract_gender(text: str) -> str:
    """Guess the gender of the person described in *text*.

    The text is lowercased and split into word tokens, so an indicator is
    recognised even when it touches punctuation ("for her.", "He's", "mom,").
    Indicators only match whole words: "he" is not found inside "the".

    Args:
        text: Free-form description to search.

    Returns:
        "male" if any male indicator is present, otherwise "female" if any
        female indicator is present, otherwise "unknown". Male indicators are
        checked first, so they win when both kinds appear.
    """
    gender_indicators = {
        'male': ('he', 'him', 'his', 'brother', 'boyfriend', 'husband', 'son', 'dad', 'father'),
        'female': ('she', 'her', 'hers', 'sister', 'girlfriend', 'wife', 'daughter', 'mom', 'mother'),
    }
    # \w+ splits on punctuation and apostrophes, so "he's" yields "he".
    words = set(re.findall(r"\w+", text.lower()))
    for gender, indicators in gender_indicators.items():
        if not words.isdisjoint(indicators):
            return gender
    return "unknown"
# Verbs that introduce something the person is interested in.
_INTEREST_VERBS = frozenset({"love", "loves", "like", "likes", "enjoy", "enjoys"})
# Verbs that introduce a dislike: they end an interest chunk but never start one.
_DISLIKE_VERBS = frozenset({"hate", "hates", "dislike", "dislikes"})
# Words that, directly before an interest verb, negate it ("doesn't like").
_NEGATIONS = frozenset({"not", "never", "don't", "doesn't", "didn't"})


def _interest_phrases(text: str) -> List[str]:
    """Return the unique phrases that follow a non-negated interest verb."""
    tokens = re.findall(r"\S+", text)
    # Lowercase for matching only; curly apostrophes are normalised so that
    # "doesn\u2019t" is recognised as a negation. Phrases keep original casing.
    lowered = [tok.lower().replace("\u2019", "'") for tok in tokens]
    boundaries = _INTEREST_VERBS | _DISLIKE_VERBS
    total = len(tokens)

    phrases: List[str] = []
    seen = set()
    i = 0
    while i < total:
        if lowered[i] not in _INTEREST_VERBS:
            i += 1
            continue

        # The chunk runs from the verb to the next like/dislike verb or the end.
        j = i + 1
        while j < total and lowered[j] not in boundaries:
            j += 1

        # "doesn't like X" / "does not like X" is a dislike, not an interest.
        negated = i > 0 and lowered[i - 1] in _NEGATIONS
        if not negated:
            chunk = " ".join(tokens[i + 1:j])
            # Split on commas and on "and" as a standalone word.
            for part in re.split(r"\s*,\s*|\s*\band\b\s*", chunk, flags=re.IGNORECASE):
                # Drop sentence punctuation left attached to the last word.
                phrase = part.strip().strip(".!?;:").strip()
                if phrase and phrase not in seen:
                    seen.add(phrase)
                    phrases.append(phrase)
        i = j  # resume at the boundary verb (or stop at the end)
    return phrases


def extract_interests(text: str, categories: List[str]) -> List[Dict]:
    """Extract the interests mentioned in *text* and classify each one.

    An interest is whatever follows "love(s)", "like(s)" or "enjoy(s)" up to
    the next like/dislike verb or the end of the text, split on commas and on
    standalone "and". Original casing is preserved and duplicates are dropped.
    Verbs directly preceded by a negation ("doesn't like", "does not like")
    are skipped, so dislikes are not reported as interests.

    Examples of the phrases found:
        "She loves painting and enjoys traveling" -> "painting", "traveling"
        "She loves art and music."                -> "art", "music"
        "She doesn't like spiders"                -> nothing

    The module-level ``zero_shot`` and ``sentiment`` pipelines are reused;
    they are not rebuilt on each call.

    Args:
        text: Free-form description to search.
        categories: Candidate labels passed to the zero-shot classifier.

    Returns:
        One dict per phrase, in order of appearance, with the keys 'phrase',
        'category' and 'confidence' (top zero-shot label and its score), and
        'sentiment' and 'sentiment_score' (label and score from the sentiment
        pipeline). Empty when no interest phrase is found.
    """
    interests_list = []
    for phrase in _interest_phrases(text):
        z_result = zero_shot(phrase, categories, multi_label=False)
        s_result = sentiment(phrase)[0]
        interests_list.append({
            'phrase': phrase,  # original casing
            'category': z_result['labels'][0],
            'confidence': z_result['scores'][0],
            'sentiment': s_result['label'],
            'sentiment_score': s_result['score'],
        })
    return interests_list
def extract_dislikes(text: str) -> List[str]:
    """Extract the things *text* says the person dislikes.

    Recognised triggers are "hate(s)", "dislike(s)", "doesn't like",
    "don't like", "does not like" and "do not like". The text is lowercased
    before matching, so the returned phrases are lowercase.

    Only the first object after each trigger is captured: the phrase ends at
    the first comma, " and ", sentence punctuation (. ! ? ;) or the end of
    the text. This keeps a following clause such as "and loves art" out of
    the result.

    Args:
        text: Free-form description to search.

    Returns:
        The disliked phrases in order of appearance; empty if none are found.
    """
    dislike_pattern = (
        # Trigger verb, anchored at a word start. Accepts ' or a curly apostrophe.
        r"\b(?:hates?|dislikes?|(?:do(?:es)?n['\u2019]t|do(?:es)?\s+not)\s+like)"
        # The disliked thing: as short as possible, never crossing punctuation.
        r"\s+([^,.!?;]+?)"
        # Sentence punctuation is an accepted terminator, so "hates spiders."
        # still matches.
        r"(?=\s+and\s+|[,.!?;]|$)"
    )
    dislikes = []
    for phrase in re.findall(dislike_pattern, text.lower()):
        cleaned = phrase.strip()
        if cleaned:
            dislikes.append(cleaned)
    return dislikes
def format_profile(profile: Dict) -> str:
    """Render *profile* as a multi-line, human-readable summary.

    Args:
        profile: Mapping with the keys 'age' (shown as "Unknown" when falsy),
            'gender' (a string, title-cased for display), 'interests' (an
            iterable of dicts that each carry a 'phrase' entry) and
            'dislikes' (a list of strings).

    Returns:
        The summary lines joined with newlines. The "Dislikes" line appears
        only when there is at least one dislike.
    """
    summary = [
        "Profile Summary:",
        f"- Age: {profile['age'] or 'Unknown'}",
        f"- Gender: {profile['gender'].title()}",
        "- Interests: " + ", ".join(entry['phrase'] for entry in profile['interests']),
    ]
    disliked = profile['dislikes']
    if disliked:
        summary.append("- Dislikes: " + ", ".join(disliked))
    return "\n".join(summary)