Zwounds's picture
Upload folder using huggingface_hub
a619283 verified
raw
history blame
6.62 kB
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import logging
# Module-level logger used by the rest of this script for progress messages.
logger = logging.getLogger(__name__)
# Root logging configuration: emit records at INFO level and above.
logging.basicConfig(level=logging.INFO)
# System prompt describing how a natural language request must be rewritten
# as a boolean search query (meta-term removal, implied-term removal, quoting).
# The arrows are real U+2192 characters; a mis-decoded copy of this text shows
# them as the three-character sequence "β†’", which must not reach the model.
SYSTEM_INSTRUCTION = """Convert natural language queries into boolean search queries by following these rules:
1. FIRST: Remove all meta-terms from this list (they should NEVER appear in output):
- articles, papers, research, studies
- examining, investigating, analyzing
- findings, documents, literature
- publications, journals, reviews
Example: "Research examining X" → just "X"
2. SECOND: Remove generic implied terms that don't add search value:
- Remove words like "practices," "techniques," "methods," "approaches," "strategies"
- Remove words like "impacts," "effects," "influences," "role," "applications"
- For example: "sustainable agriculture practices" → "sustainable agriculture"
- For example: "teaching methodologies" → "teaching"
- For example: "leadership styles" → "leadership"
3. THEN: Format the remaining terms:
CRITICAL QUOTING RULES:
- Multi-word phrases MUST ALWAYS be in quotes - NO EXCEPTIONS
- Examples of correct quoting:
- Wrong: machine learning AND deep learning
- Right: "machine learning" AND "deep learning"
- Wrong: natural language processing
- Right: "natural language processing"
- Single words must NEVER have quotes (e.g., science, research, learning)
- Use AND to connect required concepts
- Use OR with parentheses for alternatives"""
def load_model():
    """Load the boolean-search checkpoint together with its tokenizer.

    Both objects are created from the ``Zwounds/boolean-search-model``
    identifier; the model weights are requested in ``torch.float32``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    checkpoint = "Zwounds/boolean-search-model"

    logger.info("Loading model...")
    lm = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.float32)

    tok = AutoTokenizer.from_pretrained(checkpoint)
    # The caller always supplies its own system message, so the tokenizer's
    # built-in default system prompt is switched off.
    tok.use_default_system_prompt = False

    logger.info("Model loaded successfully")
    return lm, tok
def extract_response(output: str) -> str:
    """Extract the assistant's reply from decoded model output.

    The reply is the text between the first assistant header and the first
    end-of-turn marker that follows it.

    Args:
        output: Decoded text, possibly including the prompt and special tokens.

    Returns:
        The stripped assistant reply. If no assistant header is present, the
        whole input is returned stripped. If the header is present but the
        end-of-turn marker is missing (for example, generation stopped at the
        token limit), everything after the header is returned, so the prompt
        is never included in the result.
    """
    start_marker = "<|start_header_id|>assistant<|end_header_id|>"
    end_marker = "<|eot_id|>"

    # partition() splits on the first occurrence, matching str.find semantics.
    _, header, after_header = output.partition(start_marker)
    if not header:
        # No assistant turn found: nothing to cut out.
        return output.strip()

    # When end_marker is absent, partition() yields the whole tail as `reply`.
    reply, _, _ = after_header.partition(end_marker)
    return reply.strip()
def get_boolean_query(query: str, model=None, tokenizer=None) -> str:
    """Generate a boolean search query from a natural language request.

    Args:
        query: Free-text description of what the user is looking for.
        model: Causal language model; must expose ``generate`` and ``device``.
        tokenizer: Tokenizer paired with ``model``; must provide
            ``apply_chat_template``.

    Returns:
        The assistant reply as isolated by ``extract_response``, or an empty
        string when ``query`` is empty or whitespace-only.

    Raises:
        ValueError: If ``model`` or ``tokenizer`` is not supplied.
    """
    if model is None or tokenizer is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("get_boolean_query requires both a model and a tokenizer")

    if not query or not query.strip():
        # Nothing to convert; skip generation entirely.
        return ""

    # Format the conversation
    conversation = [
        {"role": "system", "content": SYSTEM_INSTRUCTION},
        {"role": "user", "content": query},
    ]

    # Render the chat template. add_generation_prompt appends the assistant
    # header so the model starts directly on its reply instead of spending
    # part of the token budget producing the header itself.
    prompt = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )
    # NOTE(review): the rendered template may already contain a BOS token and
    # this tokenizer call may add another; confirm against how the model was
    # trained before passing add_special_tokens=False.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Some tokenizers define no pad token; fall back to EOS in that case.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Greedy decoding with a small cap on the number of generated tokens.
    outputs = model.generate(
        **inputs,
        max_new_tokens=64,
        do_sample=False,
        use_cache=True,
        pad_token_id=pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Special tokens are kept in the decoded text on purpose:
    # extract_response uses them to locate the assistant turn.
    return extract_response(tokenizer.batch_decode(outputs)[0])
# Sample queries for the demo, one per rule the model is expected to apply.
# Each entry is wrapped in a single-element list (one value per example row).
examples = [
    [sample]
    for sample in (
        # Testing removal of meta-terms
        "Find research papers examining the long-term effects of meditation on brain structure",
        # Testing removal of generic implied terms (practices, techniques, methods)
        "Articles about deep learning techniques for natural language processing tasks",
        # Testing removal of impact/effect terms
        "Studies on the impact of early childhood nutrition on cognitive development",
        # Testing handling of technology applications
        "Information on virtual reality applications in architectural design and urban planning",
        # Testing proper OR relationship with parentheses
        "Research on electric vehicles adoption in urban environments or rural communities",
        # Testing proper quoting of multi-word concepts only
        "Articles on biodiversity loss in coral reefs and rainforest ecosystems",
        # Testing removal of strategy/approach terms
        "Studies about different teaching approaches for children with learning disabilities",
        # Testing complex OR relationships
        "Research examining social media influence on political polarization or public discourse",
        # Testing implied terms in specific industries
        "Articles about implementation strategies for blockchain in supply chain management or financial services",
        # Testing qualifiers that don't add search value
        "Research on effective leadership styles in multicultural organizations",
        # Testing removal of multiple implied terms
        "Studies on the effects of microplastic pollution techniques on marine ecosystem health",
        # Testing domain-specific implied terms
        "Articles about successful cybersecurity protection methods for critical infrastructure",
        # Testing generalized vs specific concepts
        "Research papers on quantum computing algorithms for cryptography or optimization problems",
        # Testing implied terms in outcome descriptions
        "Studies examining the relationship between sleep quality and academic performance outcomes",
        # Testing complex nesting of concepts
        "Articles about renewable energy integration challenges in developing countries or island nations",
    )
]
# Load the model and tokenizer once at import time; the interface callback
# below reuses these module-level objects on every call.
logger.info("Initializing model...")
model, tokenizer = load_model()

# Heading and explanatory text passed to the interface.
title = "Natural Language to Boolean Search"
description = """Convert natural language queries into boolean search expressions. The model will:
1. Remove search-related terms (like 'articles', 'research', etc.)
2. Handle generic implied terms (like 'practices', 'methods')
3. Format concepts using proper boolean syntax:
- Multi-word phrases in quotes
- Single words without quotes
- AND to connect required concepts
- OR with parentheses for alternatives
"""

# Widgets: one text box for the user's request, one for the generated query.
query_input = gr.Textbox(
    label="Enter your natural language query",
    placeholder="e.g., I'm looking for information about climate change and renewable energy",
)
query_output = gr.Textbox(label="Boolean Search Query")

# Assemble the interface; the callback binds the globally loaded model and
# tokenizer so only the query text is passed through.
demo = gr.Interface(
    fn=lambda x: get_boolean_query(x, model, tokenizer),
    inputs=[query_input],
    outputs=query_output,
    title=title,
    description=description,
    examples=examples,
    theme=gr.themes.Soft(),
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()