Spaces:
Sleeping
Sleeping
update event analyzer
Browse files
app.py
CHANGED
@@ -259,18 +259,63 @@ class EventAnalyzer:
|
|
259 |
return self.ontology.validate_pattern(text, 'temporal')
|
260 |
|
261 |
async def extract_locations(self, text):
    """Return the distinct locations found in ``text``.

    Combines the ``'locations'`` entry of the awaited ``extract_entities``
    result with the matches ``self.ontology.validate_pattern`` reports for
    the ``'location'`` pattern.

    Args:
        text: The text to scan.

    Returns:
        A list of unique locations in first-seen order: entity-extraction
        results first, then pattern-only matches.
    """
    # First await the entities result
    entities = await self.extract_entities(text)
    ml_locations = entities.get('locations') or []
    # Get pattern-based locations
    pattern_locations = self.ontology.validate_pattern(text, 'location') or []
    # dict.fromkeys de-duplicates while keeping insertion order; list(set(...))
    # returned the same items in an order that could change between runs.
    return list(dict.fromkeys([*ml_locations, *pattern_locations]))
|
268 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
269 |
async def analyze_event(self, text):
|
270 |
try:
|
271 |
# Parallel extraction
|
272 |
entities_future = self.extract_entities(text)
|
273 |
-
temporal_data = self.extract_temporal(text)
|
274 |
locations_future = self.extract_locations(text)
|
275 |
|
276 |
# Gather async results
|
@@ -282,30 +327,52 @@ class EventAnalyzer:
|
|
282 |
entities['locations'] = locations
|
283 |
entities['temporal'] = temporal_data
|
284 |
|
285 |
-
# Calculate initial confidence
|
286 |
-
confidence = min(1.0, (
|
287 |
-
0.2 * bool(entities["people"]) +
|
288 |
-
0.2 * bool(entities["organizations"]) +
|
289 |
-
0.3 * bool(entities["locations"]) +
|
290 |
-
0.3 * bool(temporal_data)
|
291 |
-
))
|
292 |
-
|
293 |
# Find related events
|
294 |
related_events = self.relationship_engine.find_related_events({
|
295 |
'text': text,
|
296 |
'entities': entities
|
297 |
})
|
298 |
|
299 |
-
#
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
|
|
307 |
)
|
308 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
309 |
|
310 |
result = {
|
311 |
"text": text,
|
@@ -316,25 +383,58 @@ class EventAnalyzer:
|
|
316 |
{
|
317 |
"text": event[1],
|
318 |
"timestamp": event[2],
|
319 |
-
"confidence": event[3]
|
|
|
320 |
}
|
321 |
for event in related_events
|
322 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
323 |
}
|
324 |
|
325 |
-
# Store event if confidence is sufficient
|
326 |
-
if confidence >= 0.6:
|
327 |
-
self.relationship_engine.conn.execute(
|
328 |
-
'INSERT INTO events (text, timestamp, confidence) VALUES (?, ?, ?)',
|
329 |
-
(text, datetime.now().isoformat(), confidence)
|
330 |
-
)
|
331 |
-
self.relationship_engine.conn.commit()
|
332 |
-
|
333 |
return result
|
334 |
|
335 |
except Exception as e:
|
336 |
return {"error": str(e)}
|
337 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
338 |
# Initialize analyzer
|
339 |
analyzer = EventAnalyzer()
|
340 |
|
|
|
259 |
return self.ontology.validate_pattern(text, 'temporal')
|
260 |
|
261 |
async def extract_locations(self, text):
    """Return the distinct locations found in ``text``.

    Combines the ``'locations'`` entry of the awaited ``extract_entities``
    result with the matches ``self.ontology.validate_pattern`` reports for
    the ``'location'`` pattern.

    Args:
        text: The text to scan.

    Returns:
        A list of unique locations in first-seen order: entity-extraction
        results first, then pattern-only matches.
    """
    entities = await self.extract_entities(text)
    ml_locations = entities.get('locations') or []
    pattern_locations = self.ontology.validate_pattern(text, 'location') or []
    # dict.fromkeys de-duplicates while keeping insertion order; list(set(...))
    # returned the same items in an order that could change between runs.
    return list(dict.fromkeys([*ml_locations, *pattern_locations]))
|
266 |
|
267 |
+
def calculate_confidence(self, entities, temporal_data, related_events):
    """Score how trustworthy an extracted event is; the result is capped at 1.0.

    The score is the sum of three parts:

    * base score -- 0.2 for any people, 0.2 for any organizations, 0.3 for
      any locations and 0.3 for any temporal data;
    * frequency boost -- at most 0.2, derived from the average stored
      ``frequency`` of this event's entities in the ``entities`` table;
    * relationship boost -- at most 0.3, taken from the related event that
      shares the most entity texts with this event (0.1 per shared entity).

    Args:
        entities: Mapping with "people", "organizations" and "locations"
            keys, each an iterable of entity strings.
        temporal_data: Temporal matches for the text; only its truthiness
            is used.
        related_events: Rows describing related events; index 0 of each row
            is used as the event id.

    Returns:
        The combined confidence as a float no greater than 1.0.

    Raises:
        KeyError: If ``entities`` lacks one of the three required keys.
    """
    conn = self.relationship_engine.conn

    # Base confidence from entity presence
    base_confidence = min(1.0, (
        0.2 * bool(entities["people"]) +
        0.2 * bool(entities["organizations"]) +
        0.3 * bool(entities["locations"]) +
        0.3 * bool(temporal_data)
    ))

    entity_texts = [
        *entities["people"],
        *entities["organizations"],
        *entities["locations"],
    ]
    placeholders = ','.join('?' * len(entity_texts))

    # Adjust confidence based on entity frequency. The placeholders must be
    # bound to the entity texts; executing the statement without them raises
    # sqlite3.ProgrammingError as soon as one entity is present.
    avg_frequency = 1
    if entity_texts:
        row = conn.execute(
            'SELECT AVG(frequency) AS avg_freq FROM entities '
            'WHERE entity_text IN ({})'.format(placeholders),
            entity_texts,
        ).fetchone()
        avg_frequency = row[0] or 1  # AVG is NULL when nothing matched
    frequency_boost = min(0.2, (avg_frequency - 1) * 0.05)  # Max 0.2 boost for frequency

    # Adjust confidence based on relationships. The current event is not
    # stored yet when this runs, so overlap is measured by entity text
    # against each related event's stored entities.
    # NOTE(review): assumes event_entities.entity_id refers to entities.id,
    # as the joins elsewhere in this file suggest -- confirm against schema.
    relationship_confidence = 0
    if related_events and entity_texts:
        shared_sql = (
            'SELECT COUNT(DISTINCT e.entity_text) AS shared_entities '
            'FROM event_entities ee '
            'JOIN entities e ON ee.entity_id = e.id '
            'WHERE ee.event_id = ? AND e.entity_text IN ({})'
        ).format(placeholders)
        relationship_scores = []
        for event in related_events:
            # event[0] is the event_id
            shared_count = conn.execute(
                shared_sql, [event[0], *entity_texts]
            ).fetchone()[0]
            relationship_scores.append(min(0.3, shared_count * 0.1))  # Max 0.3 boost per relationship
        if relationship_scores:
            relationship_confidence = max(relationship_scores)

    return min(1.0, base_confidence + frequency_boost + relationship_confidence)
|
313 |
+
|
314 |
async def analyze_event(self, text):
|
315 |
try:
|
316 |
# Parallel extraction
|
317 |
entities_future = self.extract_entities(text)
|
318 |
+
temporal_data = self.extract_temporal(text)
|
319 |
locations_future = self.extract_locations(text)
|
320 |
|
321 |
# Gather async results
|
|
|
327 |
entities['locations'] = locations
|
328 |
entities['temporal'] = temporal_data
|
329 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
330 |
# Find related events
|
331 |
related_events = self.relationship_engine.find_related_events({
|
332 |
'text': text,
|
333 |
'entities': entities
|
334 |
})
|
335 |
|
336 |
+
# Calculate confidence with enhanced logic
|
337 |
+
confidence = self.calculate_confidence(entities, temporal_data, related_events)
|
338 |
+
|
339 |
+
# Store event if confidence meets threshold
|
340 |
+
cursor = None
|
341 |
+
if confidence >= 0.6:
|
342 |
+
cursor = self.relationship_engine.conn.execute(
|
343 |
+
'INSERT INTO events (text, timestamp, confidence) VALUES (?, ?, ?)',
|
344 |
+
(text, datetime.now().isoformat(), confidence)
|
345 |
)
|
346 |
+
event_id = cursor.lastrowid
|
347 |
+
|
348 |
+
# Store entities and their relationships
|
349 |
+
self.relationship_engine.store_entities(event_id, {
|
350 |
+
'person': entities['people'],
|
351 |
+
'organization': entities['organizations'],
|
352 |
+
'location': entities['locations'],
|
353 |
+
'temporal': temporal_data,
|
354 |
+
'hashtag': entities['hashtags']
|
355 |
+
})
|
356 |
+
|
357 |
+
# Update entity relationships
|
358 |
+
self.relationship_engine.update_entity_relationships(event_id)
|
359 |
+
|
360 |
+
self.relationship_engine.conn.commit()
|
361 |
+
|
362 |
+
# Get entity relationships for rich output
|
363 |
+
entity_relationships = []
|
364 |
+
if cursor and cursor.lastrowid:
|
365 |
+
query = '''
|
366 |
+
SELECT DISTINCT er.*,
|
367 |
+
e1.entity_text as source_text, e1.entity_type as source_type,
|
368 |
+
e2.entity_text as target_text, e2.entity_type as target_type
|
369 |
+
FROM event_entities ee
|
370 |
+
JOIN entity_relationships er ON ee.entity_id IN (er.source_entity_id, er.target_entity_id)
|
371 |
+
JOIN entities e1 ON er.source_entity_id = e1.id
|
372 |
+
JOIN entities e2 ON er.target_entity_id = e2.id
|
373 |
+
WHERE ee.event_id = ?
|
374 |
+
'''
|
375 |
+
entity_relationships = self.relationship_engine.conn.execute(query, (cursor.lastrowid,)).fetchall()
|
376 |
|
377 |
result = {
|
378 |
"text": text,
|
|
|
383 |
{
|
384 |
"text": event[1],
|
385 |
"timestamp": event[2],
|
386 |
+
"confidence": event[3],
|
387 |
+
"shared_entities": event[4] if len(event) > 4 else None
|
388 |
}
|
389 |
for event in related_events
|
390 |
+
],
|
391 |
+
"entity_relationships": [
|
392 |
+
{
|
393 |
+
"type": rel[3],
|
394 |
+
"source": rel[6],
|
395 |
+
"target": rel[8],
|
396 |
+
"confidence": rel[4]
|
397 |
+
}
|
398 |
+
for rel in entity_relationships
|
399 |
+
] if entity_relationships else []
|
400 |
}
|
401 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
402 |
return result
|
403 |
|
404 |
except Exception as e:
|
405 |
return {"error": str(e)}
|
406 |
|
407 |
+
def get_entity_statistics(self):
    """Summarize the stored entities and relationships.

    Runs three read-only queries on the relationship engine's connection
    and returns their rows in a dict keyed as follows:

    * ``entity_counts`` -- per entity type: type, row count, mean frequency;
    * ``frequent_entities`` -- the ten entities with the highest frequency;
    * ``relationship_stats`` -- per relationship type: type, row count,
      mean confidence.
    """
    connection = self.relationship_engine.conn

    # (result key, SQL) pairs, executed in this order.
    report_queries = (
        ('entity_counts', '''
            SELECT entity_type, COUNT(*) as count, AVG(frequency) as avg_frequency
            FROM entities
            GROUP BY entity_type
        '''),
        ('frequent_entities', '''
            SELECT entity_text, entity_type, frequency
            FROM entities
            ORDER BY frequency DESC
            LIMIT 10
        '''),
        ('relationship_stats', '''
            SELECT relationship_type, COUNT(*) as count, AVG(confidence) as avg_confidence
            FROM entity_relationships
            GROUP BY relationship_type
        '''),
    )

    return {
        key: connection.execute(sql).fetchall()
        for key, sql in report_queries
    }
|
437 |
+
|
438 |
# Initialize analyzer
|
439 |
analyzer = EventAnalyzer()
|
440 |
|