bhlewis committed on
Commit 2a70269 · verified · 1 Parent(s): b661953

Update app.py

Files changed (1)
  1. app.py +42 -37
app.py CHANGED
@@ -6,6 +6,8 @@ import json
 from sentence_transformers import SentenceTransformer
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+import re
+from collections import Counter
 
 def load_data():
     try:
@@ -49,32 +51,27 @@ model = SentenceTransformer('all-mpnet-base-v2')
 tfidf_vectorizer = TfidfVectorizer(stop_words='english')
 tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
 
-# Synonym dictionary for query expansion
-synonyms = {
-    "slidable": ["detachable", "removable"],
-    "flexible": ["elastic", "deformable"],
-    "aerosol": ["vapor"],
-    "device": ["generator"]
-}
-
-def expand_query(query):
-    words = query.split()
-    expanded_query = []
-    for word in words:
-        if word in synonyms:
-            expanded_query.append(f"({word} OR {' OR '.join(synonyms[word])})")
-        else:
-            expanded_query.append(word)
-    return " ".join(expanded_query)
+def extract_key_features(text):
+    # Extract noun phrases as potential key features
+    noun_phrases = re.findall(r'\b(?:[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b', text)
+    # Extract phrases following "comprising", "including", "consisting of" as potential key features
+    feature_phrases = re.findall(r'(?:comprising|including|consisting of)\s+(.*?)(?:;|\.)', text, re.IGNORECASE)
+
+    all_features = noun_phrases + feature_phrases
+    # Remove duplicates and lowercase
+    return list(set(feature.lower() for feature in all_features))
+
+def compare_features(query_features, patent_features):
+    common_features = set(query_features) & set(patent_features)
+    similarity_score = len(common_features) / max(len(query_features), len(patent_features))
+    return common_features, similarity_score
 
 def hybrid_search(query, top_k=5):
     print(f"Original query: {query}")
 
-    # Expand the query using synonyms
-    expanded_query = expand_query(query)
-    print(f"Expanded query: {expanded_query}")
+    query_features = extract_key_features(query)
 
-    # Encode the original query using the transformer model
+    # Encode the query using the transformer model
     query_embedding = model.encode([query])[0]
     query_embedding = query_embedding / np.linalg.norm(query_embedding)
 
@@ -82,7 +79,7 @@ def hybrid_search(query, top_k=5):
     semantic_distances, semantic_indices = index.search(np.array([query_embedding]), top_k * 2)
 
     # Perform TF-IDF based search
-    query_tfidf = tfidf_vectorizer.transform([expanded_query])
+    query_tfidf = tfidf_vectorizer.transform([query])
     tfidf_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
     tfidf_indices = tfidf_similarities.argsort()[-top_k * 2:][::-1]
 
@@ -90,28 +87,36 @@
     combined_results = {}
     for i, idx in enumerate(semantic_indices[0]):
         patent_number = patent_numbers[idx].decode('utf-8')
-        combined_results[patent_number] = semantic_distances[0][i] * 1.5  # Increase weight for semantic similarity
+        text = metadata[patent_number]['text']
+        patent_features = extract_key_features(text)
+        common_features, feature_similarity = compare_features(query_features, patent_features)
+        combined_results[patent_number] = {
+            'score': semantic_distances[0][i] * 1.5 + feature_similarity,
+            'common_features': common_features,
+            'text': text
+        }
 
     for idx in tfidf_indices:
         patent_number = patent_numbers[idx].decode('utf-8')
-        if patent_number in combined_results:
-            combined_results[patent_number] += tfidf_similarities[idx]
-        else:
-            combined_results[patent_number] = tfidf_similarities[idx]
+        if patent_number not in combined_results:
+            text = metadata[patent_number]['text']
+            patent_features = extract_key_features(text)
+            common_features, feature_similarity = compare_features(query_features, patent_features)
+            combined_results[patent_number] = {
+                'score': tfidf_similarities[idx] + feature_similarity,
+                'common_features': common_features,
+                'text': text
+            }
 
     # Sort and get top results
-    top_results = sorted(combined_results.items(), key=lambda x: x[1], reverse=True)[:top_k]
+    top_results = sorted(combined_results.items(), key=lambda x: x[1]['score'], reverse=True)[:top_k]
 
     results = []
-    for patent_number, score in top_results:
-        if patent_number not in metadata:
-            print(f"Warning: Patent number {patent_number} not found in metadata")
-            continue
-        patent_data = metadata[patent_number]
+    for patent_number, data in top_results:
         result = f"Patent Number: {patent_number}\n"
-        text = patent_data.get('text', 'No text available')
-        result += f"Text: {text[:200]}...\n"
-        result += f"Combined Score: {score:.4f}\n\n"
+        result += f"Text: {data['text'][:200]}...\n"
+        result += f"Combined Score: {data['score']:.4f}\n"
+        result += f"Common Key Features: {', '.join(data['common_features'])}\n\n"
         results.append(result)
 
     return "\n".join(results)
@@ -119,10 +124,10 @@ def hybrid_search(query, top_k=5):
 # Create Gradio interface
 iface = gr.Interface(
     fn=hybrid_search,
-    inputs=gr.Textbox(lines=2, placeholder="Enter your search query here..."),
+    inputs=gr.Textbox(lines=2, placeholder="Enter your patent query here..."),
     outputs=gr.Textbox(lines=10, label="Search Results"),
     title="Patent Similarity Search",
-    description="Enter a query to find similar patents based on their content."
+    description="Enter a patent description to find similar patents based on key features."
 )
 
 if __name__ == "__main__":
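
For reference, a minimal standalone sketch of how the new feature-based scoring behaves. The two helper functions are copied from the diff above; the sample query, sample patent text, and the stand-in semantic score are illustrative assumptions, not values from the app.

import re

def extract_key_features(text):
    # Capitalized word runs as rough noun-phrase candidates (same regex as the diff)
    noun_phrases = re.findall(r'\b(?:[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b', text)
    # Claim-style phrases after "comprising", "including", "consisting of"
    feature_phrases = re.findall(r'(?:comprising|including|consisting of)\s+(.*?)(?:;|\.)', text, re.IGNORECASE)
    return list(set(f.lower() for f in noun_phrases + feature_phrases))

def compare_features(query_features, patent_features):
    common_features = set(query_features) & set(patent_features)
    similarity_score = len(common_features) / max(len(query_features), len(patent_features))
    return common_features, similarity_score

# Illustrative inputs (assumed, not from the app's dataset)
query = "An Aerosol Generator comprising a removable heating element."
patent_text = "An Aerosol Generator including a removable heating element; comprising a battery."

query_features = extract_key_features(query)
patent_features = extract_key_features(patent_text)
common, feature_similarity = compare_features(query_features, patent_features)

# Stand-in FAISS similarity for this hit (assumed value), combined as in hybrid_search
semantic_score = 0.80
combined_score = semantic_score * 1.5 + feature_similarity

print(f"Common features: {common}")
print(f"Feature similarity: {feature_similarity:.3f}")
print(f"Combined score: {combined_score:.3f}")

On these example texts both helpers find "an aerosol generator" and "a removable heating element", so feature_similarity is 2/3 and the feature term nudges the weighted semantic score rather than dominating it.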