# Gift-Recommender/src/utils/text_processors.py
import re
from typing import Dict, List, Optional
from transformers import pipeline
# Instantiate your pipelines just once
zero_shot = pipeline("zero-shot-classification")
sentiment = pipeline("sentiment-analysis")
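# Note: creating pipelines without an explicit model makes transformers fall back to a
# default checkpoint per task and emit a warning. A minimal sketch of pinning the models
# explicitly (the checkpoint names below are the usual defaults, not something this
# project mandates):
#
#   zero_shot = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
#   sentiment = pipeline(
#       "sentiment-analysis",
#       model="distilbert-base-uncased-finetuned-sst-2-english",
#   )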
def extract_age(text: str) -> Optional[int]:
    """Return the first plausible age (1-119) found in the text, else None.

    Prefers phrases like "25 years old" / "25-year-old"; otherwise falls back to
    any standalone one- or two-digit number.
    """
    age_pattern = r'\b(\d{1,2})\s*-?\s*years?\s*-?\s*old\b|\b(\d{1,2})\b'
    matches = re.findall(age_pattern, text)
    if matches:
        # Each match is a tuple of the two capture groups; take the first non-empty one.
        age = next(int(num) for nums in matches for num in nums if num)
        return age if 0 < age < 120 else None
    return None
def extract_gender(text: str) -> str:
    """Infer gender from pronouns and relationship words; returns 'male', 'female', or 'unknown'."""
    text_lower = text.lower()
    gender_indicators = {
        'male': ['he', 'him', 'his', 'brother', 'boyfriend', 'husband', 'son', 'dad', 'father'],
        'female': ['she', 'her', 'hers', 'sister', 'girlfriend', 'wife', 'daughter', 'mom', 'mother']
    }
    for gender, indicators in gender_indicators.items():
        # Whole-word match, so indicators followed by punctuation (e.g. "for him.") still count.
        if any(re.search(rf'\b{re.escape(indicator)}\b', text_lower) for indicator in indicators):
            return gender
    return "unknown"
def extract_interests(text: str, categories: List[str]) -> List[Dict]:
    """
    Extract interest phrases that follow verbs like "love(s)", "like(s)", or "enjoy(s)",
    collecting tokens until the next recognized verb or the end of the text, then
    splitting on standalone "and"/commas while preserving the original casing
    (so "painting" is recognized properly).

    Each phrase is classified against `categories` with the zero-shot pipeline and
    scored with the sentiment pipeline, so the result is a list of dicts with the keys
    'phrase', 'category', 'confidence', 'sentiment', and 'sentiment_score'.

    Example phrases extracted:
        "She loves painting and enjoys traveling" -> ["painting", "traveling"]
        "She loves art and music" -> ["art", "music"]
    """
    # Reuse the module-level zero_shot and sentiment pipelines instead of reloading
    # the models on every call.
    # Tokenize into runs of non-whitespace characters
    tokens = re.findall(r"\S+", text)
    n = len(tokens)
    # Recognized verbs (compared lowercased)
    verb_set = {"love", "loves", "like", "likes", "enjoy", "enjoys"}
    interests_list = []
    seen = set()
    i = 0
    while i < n:
        word_lower = tokens[i].lower()
        if word_lower in verb_set:
            # Collect subsequent tokens until the next verb or the end of the text
            j = i + 1
            while j < n and tokens[j].lower() not in verb_set:
                j += 1
            # Tokens i+1..j-1 form the chunk
            chunk_tokens = tokens[i+1 : j]
            if chunk_tokens:
                # e.g. ["painting", "and"]
                chunk_str = " ".join(chunk_tokens)
                # Split on standalone "and" or commas, ignoring case
                sub_parts = re.split(r'\s*,\s*|\s*\band\b\s*', chunk_str, flags=re.IGNORECASE)
                for candidate in sub_parts:
                    candidate = candidate.strip()
                    if candidate and candidate not in seen:
                        seen.add(candidate)
                        # Zero-shot category + sentiment for each new phrase
                        z_result = zero_shot(candidate, categories, multi_label=False)
                        s_result = sentiment(candidate)[0]
                        interests_list.append({
                            'phrase': candidate,  # preserve original casing
                            'category': z_result['labels'][0],
                            'confidence': z_result['scores'][0],
                            'sentiment': s_result['label'],
                            'sentiment_score': s_result['score']
                        })
            i = j  # skip past the chunk
        else:
            i += 1
    return interests_list
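# For reference, tracing the splitting logic above on a sample sentence (this part is
# deterministic, no model calls involved):
#   "She loves art and music, and enjoys hiking"
#   chunk after "loves"  -> "art and music, and" -> candidates ["art", "music"]
#   chunk after "enjoys" -> "hiking"             -> candidate  ["hiking"]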
def extract_dislikes(text: str) -> List[str]:
    """Extract phrases following "hates", "dislikes", or "doesn't / does not like".

    Matching is done on the lowercased text, so returned phrases are lowercase.
    """
    text_lower = text.lower()
    # Capture up to the next "and", comma/period, or end of text.
    dislike_pattern = r'(?:hates|dislikes|(?:doesn\'t|does\s+not)\s+like)\s+([^,.]+?)(?=\s+and\s+|[,.]|$)'
    matches = re.findall(dislike_pattern, text_lower)
    dislikes = []
    for match in matches:
        # Defensively split in case a capture still contains commas or "and".
        parts = re.split(r'(?:,\s*|\s+and\s+)', match)
        for p in parts:
            cleaned = p.replace("doesn't like ", "").replace("does not like ", "").strip()
            if cleaned:
                dislikes.append(cleaned)
    return dislikes
def format_profile(profile: Dict) -> str:
    """Render a profile dict (age, gender, interests, dislikes) as a readable summary."""
    output = []
    output.append("Profile Summary:")
    output.append(f"- Age: {profile['age'] or 'Unknown'}")
    output.append(f"- Gender: {profile['gender'].title()}")
    output.append("- Interests: " + ", ".join(i['phrase'] for i in profile['interests']))
    if profile['dislikes']:
        output.append("- Dislikes: " + ", ".join(profile['dislikes']))
    return "\n".join(output)
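# A minimal usage sketch (not part of the module's public API): it assumes the caller
# assembles the profile dict from the helpers above, and the category list below is
# purely illustrative. Running it will download the default pipeline models on first use.
if __name__ == "__main__":
    sample = (
        "My sister hates horror movies. She is 25 years old and "
        "loves painting and enjoys traveling"
    )
    categories = ["art", "travel", "sports", "technology", "music"]
    profile = {
        "age": extract_age(sample),
        "gender": extract_gender(sample),
        "interests": extract_interests(sample, categories),
        "dislikes": extract_dislikes(sample),
    }
    print(format_profile(profile))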