File size: 5,875 Bytes
39838a2
53a648d
39838a2
53a648d
39838a2
 
 
 
 
6d79ec9
39838a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d79ec9
39838a2
6d79ec9
53a648d
6d79ec9
53a648d
 
 
6d79ec9
 
53a648d
a619283
 
 
53a648d
 
 
a619283
39838a2
53a648d
39838a2
a619283
53a648d
 
 
 
 
 
39838a2
a619283
53a648d
 
 
 
 
 
e34d0c9
 
53a648d
39838a2
6d79ec9
39838a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d79ec9
 
53a648d
6d79ec9
 
 
 
 
 
 
 
 
 
 
 
 
39838a2
 
53a648d
39838a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import gradio as gr
from llama_cpp import Llama
import logging
import os

# Setup logging: INFO-level root configuration plus a logger for this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# System prompt sent with every request. It describes, step by step, how a
# natural-language request is reduced to a boolean search expression.
# The "→" arrows separate an example input from its expected reduction.
SYSTEM_INSTRUCTION = """Convert natural language queries into boolean search queries by following these rules:

1. FIRST: Remove all meta-terms from this list (they should NEVER appear in output):
   - articles, papers, research, studies
   - examining, investigating, analyzing
   - findings, documents, literature
   - publications, journals, reviews
   Example: "Research examining X" → just "X"

2. SECOND: Remove generic implied terms that don't add search value:
   - Remove words like "practices," "techniques," "methods," "approaches," "strategies"
   - Remove words like "impacts," "effects," "influences," "role," "applications"
   - For example: "sustainable agriculture practices" → "sustainable agriculture"
   - For example: "teaching methodologies" → "teaching"
   - For example: "leadership styles" → "leadership"

3. THEN: Format the remaining terms:
   CRITICAL QUOTING RULES:
   - Multi-word phrases MUST ALWAYS be in quotes - NO EXCEPTIONS
   - Examples of correct quoting:
     - Wrong: machine learning AND deep learning
     - Right: "machine learning" AND "deep learning"
     - Wrong: natural language processing
     - Right: "natural language processing"
   - Single words must NEVER have quotes (e.g., science, research, learning)
   - Use AND to connect required concepts
   - Use OR with parentheses for alternatives"""

def load_model():
    """Build the Llama model used for query conversion.

    Returns:
        The object produced by ``Llama.from_pretrained`` for the file
        ``boolean.gguf`` in the ``Zwounds/boolean-search-model`` repository.
    """
    # Where the weights come from, kept together for easy reading.
    weights_source = {
        "repo_id": "Zwounds/boolean-search-model",
        "filename": "boolean.gguf",
    }

    logger.info("Loading model...")
    llm = Llama.from_pretrained(**weights_source)
    logger.info("Model loaded successfully")
    return llm

def extract_response(output: str) -> str:
    """Return *output* without leading or trailing whitespace.

    A falsy value (empty string or ``None``) yields an empty string.
    """
    return output.strip() if output else ""

def get_boolean_query(query: str, model=None) -> str:
    """Generate a boolean search query from a natural-language request.

    The request is wrapped in a ChatML-style prompt (system instruction,
    user turn, open assistant turn) and completed by ``model`` with
    temperature 0 and at most 64 new tokens.

    Args:
        query: Natural-language description of what to search for.
        model: Object exposing ``create_completion`` (the loaded Llama
            model). Required whenever ``query`` has content.

    Returns:
        The generated text with surrounding whitespace removed, or an empty
        string when ``query`` is empty or blank.

    Raises:
        ValueError: If ``query`` has content but ``model`` is ``None``.
    """
    # The user text is spliced into the template below; drop the template's
    # own turn markers so the text cannot end the user turn or open a new one.
    user_text = query or ""
    for marker in ("<|im_start|>", "<|im_end|>"):
        user_text = user_text.replace(marker, " ")
    user_text = user_text.strip()

    # Nothing to convert: answer immediately instead of running inference.
    if not user_text:
        return ""

    # Fail with a clear message rather than an AttributeError on None.
    if model is None:
        raise ValueError("get_boolean_query() needs a loaded model; got None")

    # Format the conversation
    prompt = f"""<|im_start|>system
{SYSTEM_INSTRUCTION}<|im_end|>
<|im_start|>user
{user_text}<|im_end|>
<|im_start|>assistant
"""

    # Generate response; generation stops at the end-of-turn marker.
    output = model.create_completion(
        prompt,
        max_tokens=64,
        stop=["<|im_end|>"],
        echo=False,
        temperature=0.0,
    )

    return extract_response(output['choices'][0]['text'])

# Example queries demonstrating various cases, one plain string per case.
_EXAMPLE_QUERIES = (
    # Meta-term removal
    "Find research papers examining the long-term effects of meditation on brain structure",
    # Generic implied terms (practices, techniques, methods)
    "Articles about deep learning techniques for natural language processing tasks",
    # Impact/effect terms
    "Studies on the impact of early childhood nutrition on cognitive development",
    # Technology applications
    "Information on virtual reality applications in architectural design and urban planning",
    # OR relationship with parentheses
    "Research on electric vehicles adoption in urban environments or rural communities",
    # Quoting of multi-word concepts only
    "Articles on biodiversity loss in coral reefs and rainforest ecosystems",
    # Strategy/approach terms
    "Studies about different teaching approaches for children with learning disabilities",
    # Complex OR relationships
    "Research examining social media influence on political polarization or public discourse",
    # Implied terms in specific industries
    "Articles about implementation strategies for blockchain in supply chain management or financial services",
    # Qualifiers that don't add search value
    "Research on effective leadership styles in multicultural organizations",
    # Multiple implied terms
    "Studies on the effects of microplastic pollution techniques on marine ecosystem health",
    # Domain-specific implied terms
    "Articles about successful cybersecurity protection methods for critical infrastructure",
    # Generalized vs specific concepts
    "Research papers on quantum computing algorithms for cryptography or optimization problems",
    # Implied terms in outcome descriptions
    "Studies examining the relationship between sleep quality and academic performance outcomes",
    # Complex nesting of concepts
    "Articles about renewable energy integration challenges in developing countries or island nations",
)

# Each example is a one-element row, since the interface has a single input.
examples = [[text] for text in _EXAMPLE_QUERIES]

# Load the model once at import time; the interface callback below reuses it.
logger.info("Initializing model...")
model = load_model()

# Text shown at the top of the Gradio page.
title = "Natural Language to Boolean Search"
description = """Convert natural language queries into boolean search expressions. The model will:

1. Remove search-related terms (like 'articles', 'research', etc.)
2. Handle generic implied terms (like 'practices', 'methods')
3. Format concepts using proper boolean syntax:
   - Multi-word phrases in quotes
   - Single words without quotes
   - AND to connect required concepts
   - OR with parentheses for alternatives
"""

# Widgets are created input first, then output, before wiring the interface.
_query_box = gr.Textbox(
    label="Enter your natural language query",
    placeholder="e.g., I'm looking for information about climate change and renewable energy",
)
_result_box = gr.Textbox(label="Boolean Search Query")

# Single-input, single-output interface around get_boolean_query.
demo = gr.Interface(
    fn=lambda x: get_boolean_query(x, model),
    inputs=[_query_box],
    outputs=_result_box,
    title=title,
    description=description,
    examples=examples,
    theme=gr.themes.Soft(),
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()