Zwounds commited on
Commit
39838a2
Β·
verified Β·
1 Parent(s): 928fd8e

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. README.md +2 -8
  2. demo.py +172 -0
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: Boolean Search Query Model
3
- emoji: 🐨
4
- colorFrom: blue
5
- colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 5.21.0
8
- app_file: app.py
9
- pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Boolean_Search_Query_Model
3
+ app_file: demo.py
 
 
4
  sdk: gradio
5
  sdk_version: 5.21.0
 
 
6
  ---
 
 
demo.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from unsloth import FastLanguageModel
4
+ import logging
5
+
6
+ # Setup logging
7
+ logging.basicConfig(level=logging.INFO)
8
+ logger = logging.getLogger(__name__)
9
+
10
+ def load_model():
11
+ """Load fine-tuned model."""
12
+ logger.info("Loading model...")
13
+ model, tokenizer = FastLanguageModel.from_pretrained(
14
+ "boolean_model_merged",
15
+ max_seq_length=2048,
16
+ dtype=None, # Auto-detect
17
+ load_in_4bit=True
18
+ )
19
+ FastLanguageModel.for_inference(model)
20
+ return model, tokenizer
21
+
22
+ def format_prompt(query):
23
+ """Format query with instruction prompt."""
24
+ return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
25
+
26
+ ### Instruction:
27
+ Convert this natural language query into a boolean search query by following these rules:
28
+
29
+ 1. FIRST: Remove all meta-terms from this list (they should NEVER appear in output):
30
+ - articles, papers, research, studies
31
+ - examining, investigating, analyzing
32
+ - findings, documents, literature
33
+ - publications, journals, reviews
34
+ Example: "Research examining X" β†’ just "X"
35
+
36
+ 2. SECOND: Remove generic implied terms that don't add search value:
37
+ - Remove words like "practices," "techniques," "methods," "approaches," "strategies"
38
+ - Remove words like "impacts," "effects," "influences," "role," "applications"
39
+ - For example: "sustainable agriculture practices" β†’ "sustainable agriculture"
40
+ - For example: "teaching methodologies" β†’ "teaching"
41
+ - For example: "leadership styles" β†’ "leadership"
42
+
43
+ 3. THEN: Format the remaining terms:
44
+ CRITICAL QUOTING RULES:
45
+ - Multi-word phrases MUST ALWAYS be in quotes - NO EXCEPTIONS
46
+ - Examples of correct quoting:
47
+ - Wrong: machine learning AND deep learning
48
+ - Right: "machine learning" AND "deep learning"
49
+ - Wrong: natural language processing
50
+ - Right: "natural language processing"
51
+ - Single words must NEVER have quotes (e.g., science, research, learning)
52
+ - Use AND to connect required concepts
53
+ - Use OR with parentheses for alternatives (e.g., ("soil health" OR biodiversity))
54
+
55
+ Example conversions showing proper quoting:
56
+ "Research on machine learning for natural language processing"
57
+ β†’ "machine learning" AND "natural language processing"
58
+
59
+ "Studies examining anxiety depression stress in workplace"
60
+ β†’ (anxiety OR depression OR stress) AND workplace
61
+
62
+ "Articles about deep learning impact on computer vision"
63
+ β†’ "deep learning" AND "computer vision"
64
+
65
+ "Research on sustainable agriculture practices and their impact on soil health or biodiversity"
66
+ β†’ "sustainable agriculture" AND ("soil health" OR biodiversity)
67
+
68
+ "Articles about effective teaching methods for second language acquisition"
69
+ β†’ teaching AND "second language acquisition"
70
+
71
+ ### Input:
72
+ {query}
73
+
74
+ ### Response:
75
+ """
76
+
77
+ def get_boolean_query(query):
78
+ """Generate boolean query from natural language."""
79
+ prompt = format_prompt(query)
80
+ device = "cuda" if torch.cuda.is_available() else "cpu"
81
+
82
+ # Tokenize and generate response
83
+ inputs = tokenizer(prompt, return_tensors="pt").to(device)
84
+ outputs = model.generate(
85
+ **inputs,
86
+ max_new_tokens=32,
87
+ do_sample=False,
88
+ use_cache=True,
89
+ eos_token_id=tokenizer.eos_token_id
90
+ )
91
+
92
+ # Extract response section and clean output
93
+ full_response = tokenizer.decode(outputs[0])
94
+ response = full_response.split("### Response:")[-1].strip()
95
+ # Remove end of text token if present
96
+ cleaned_response = response.replace("<|end_of_text|>", "").strip()
97
+ return cleaned_response
98
+
99
+ # Load model globally
100
+ logger.info("Initializing model...")
101
+ model, tokenizer = load_model()
102
+ logger.info("Model loaded successfully")
103
+
104
+ # Example queries using more natural language
105
+ examples = [
106
+ # Testing removal of meta-terms
107
+ ["Find research papers examining the long-term effects of meditation on brain structure"],
108
+
109
+ # Testing removal of generic implied terms (practices, techniques, methods)
110
+ ["Articles about deep learning techniques for natural language processing tasks"],
111
+
112
+ # Testing removal of impact/effect terms
113
+ ["Studies on the impact of early childhood nutrition on cognitive development"],
114
+
115
+ # Testing handling of technology applications
116
+ ["Information on virtual reality applications in architectural design and urban planning"],
117
+
118
+ # Testing proper OR relationship with parentheses
119
+ ["Research on electric vehicles adoption in urban environments or rural communities"],
120
+
121
+ # Testing proper quoting of multi-word concepts only
122
+ ["Articles on biodiversity loss in coral reefs and rainforest ecosystems"],
123
+
124
+ # Testing removal of strategy/approach terms
125
+ ["Studies about different teaching approaches for children with learning disabilities"],
126
+
127
+ # Testing complex OR relationships
128
+ ["Research examining social media influence on political polarization or public discourse"],
129
+
130
+ # Testing implied terms in specific industries
131
+ ["Articles about implementation strategies for blockchain in supply chain management or financial services"],
132
+
133
+ # Testing qualifiers that don't add search value
134
+ ["Research on effective leadership styles in multicultural organizations"],
135
+
136
+ # Testing removal of multiple implied terms
137
+ ["Studies on the effects of microplastic pollution techniques on marine ecosystem health"],
138
+
139
+ # Testing domain-specific implied terms
140
+ ["Articles about successful cybersecurity protection methods for critical infrastructure"],
141
+
142
+ # Testing generalized vs specific concepts
143
+ ["Research papers on quantum computing algorithms for cryptography or optimization problems"],
144
+
145
+ # Testing implied terms in outcome descriptions
146
+ ["Studies examining the relationship between sleep quality and academic performance outcomes"],
147
+
148
+ # Testing complex nesting of concepts
149
+ ["Articles about renewable energy integration challenges in developing countries or island nations"]
150
+ ]
151
+
152
+
153
+ # Create Gradio interface with metadata for deployment
154
+ title = "Boolean Search Query Generator"
155
+ description = "Convert natural language queries into boolean search expressions. The model will remove search-related terms (like 'articles', 'research', etc.), handle generic implied terms (like 'practices', 'methods'), and format the core concepts using proper boolean syntax."
156
+ demo = gr.Interface(
157
+ fn=get_boolean_query,
158
+ inputs=[
159
+ gr.Textbox(
160
+ label="Enter your natural language query",
161
+ placeholder="e.g., I'm looking for information about climate change and renewable energy"
162
+ )
163
+ ],
164
+ outputs=gr.Textbox(label="Boolean Search Query"),
165
+ title=title,
166
+ description=description,
167
+ examples=examples,
168
+ theme=gr.themes.Soft()
169
+ )
170
+
171
+ if __name__ == "__main__":
172
+ demo.launch()