dwb2023 committed on
Commit
7580068
·
verified ·
1 Parent(s): e8d9eb9

update event analyzer

Browse files
Files changed (1) hide show
  1. app.py +130 -30
app.py CHANGED
@@ -259,18 +259,63 @@ class EventAnalyzer:
259
  return self.ontology.validate_pattern(text, 'temporal')
260
 
261
  async def extract_locations(self, text):
262
- # First await the entities result
263
  entities = await self.extract_entities(text)
264
  ml_locations = entities.get('locations', [])
265
- # Get pattern-based locations
266
  pattern_locations = self.ontology.validate_pattern(text, 'location')
267
  return list(set(ml_locations + pattern_locations))
268
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  async def analyze_event(self, text):
270
  try:
271
  # Parallel extraction
272
  entities_future = self.extract_entities(text)
273
- temporal_data = self.extract_temporal(text) # This is synchronous now
274
  locations_future = self.extract_locations(text)
275
 
276
  # Gather async results
@@ -282,30 +327,52 @@ class EventAnalyzer:
282
  entities['locations'] = locations
283
  entities['temporal'] = temporal_data
284
 
285
- # Calculate initial confidence
286
- confidence = min(1.0, (
287
- 0.2 * bool(entities["people"]) +
288
- 0.2 * bool(entities["organizations"]) +
289
- 0.3 * bool(entities["locations"]) +
290
- 0.3 * bool(temporal_data)
291
- ))
292
-
293
  # Find related events
294
  related_events = self.relationship_engine.find_related_events({
295
  'text': text,
296
  'entities': entities
297
  })
298
 
299
- # Adjust confidence based on relationships
300
- if related_events:
301
- relationship_confidence = max(
302
- self.relationship_engine.calculate_relationship_confidence(
303
- {'entities': entities},
304
- {'text': event[1]}
305
- )
306
- for event in related_events
 
307
  )
308
- confidence = (confidence + relationship_confidence) / 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
 
310
  result = {
311
  "text": text,
@@ -316,25 +383,58 @@ class EventAnalyzer:
316
  {
317
  "text": event[1],
318
  "timestamp": event[2],
319
- "confidence": event[3]
 
320
  }
321
  for event in related_events
322
- ]
 
 
 
 
 
 
 
 
 
323
  }
324
 
325
- # Store event if confidence is sufficient
326
- if confidence >= 0.6:
327
- self.relationship_engine.conn.execute(
328
- 'INSERT INTO events (text, timestamp, confidence) VALUES (?, ?, ?)',
329
- (text, datetime.now().isoformat(), confidence)
330
- )
331
- self.relationship_engine.conn.commit()
332
-
333
  return result
334
 
335
  except Exception as e:
336
  return {"error": str(e)}
337
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
  # Initialize analyzer
339
  analyzer = EventAnalyzer()
340
 
 
259
  return self.ontology.validate_pattern(text, 'temporal')
260
 
261
async def extract_locations(self, text):
    """Return the de-duplicated union of ML-extracted and pattern-matched locations."""
    # ML side: pull locations out of the awaited entity-extraction result.
    entity_map = await self.extract_entities(text)
    merged = set(entity_map.get('locations', []))
    # Pattern side: ontology-driven location patterns over the raw text.
    merged.update(self.ontology.validate_pattern(text, 'location'))
    return list(merged)
266
 
267
def calculate_confidence(self, entities, temporal_data, related_events):
    """Score an event in [0.0, 1.0] from entity presence, stored entity
    frequency, and overlap with previously stored related events.

    Args:
        entities: dict with at least "people", "organizations" and
            "locations" lists of entity strings.
        temporal_data: truthy when a temporal expression was found.
        related_events: candidate related-event rows; row[0] is the event id.

    Returns:
        float confidence, capped at 1.0.
    """
    # Base confidence: weighted presence of each entity category.
    base_confidence = min(1.0, (
        0.2 * bool(entities["people"]) +
        0.2 * bool(entities["organizations"]) +
        0.3 * bool(entities["locations"]) +
        0.3 * bool(temporal_data)
    ))

    # Frequency boost: entities seen often before are more trustworthy.
    entity_texts = [
        *entities["people"],
        *entities["organizations"],
        *entities["locations"],
    ]
    frequency_boost = 0.0
    if entity_texts:
        # Bug fix: the original built "?" placeholders but never passed the
        # bound values to execute() (sqlite3 ProgrammingError), and with no
        # entities it produced "IN ()", a SQLite syntax error. The redundant
        # nested IN-subquery over the same table is also dropped.
        placeholders = ','.join('?' * len(entity_texts))
        cursor = self.relationship_engine.conn.execute(
            'SELECT AVG(frequency) as avg_freq FROM entities '
            'WHERE entity_text IN ({})'.format(placeholders),
            entity_texts,
        )
        avg_frequency = cursor.fetchone()[0] or 1
        frequency_boost = min(0.2, (avg_frequency - 1) * 0.05)  # max 0.2 boost

    # Relationship boost: best shared-entity score across related events.
    relationship_confidence = 0
    if related_events:
        relationship_scores = []
        for event in related_events:
            # NOTE(review): both placeholders receive the same id, so this
            # counts an event's overlap with itself. The new event's id was
            # probably intended for one side, but it is not assigned until
            # the event is stored — confirm intent before changing.
            cursor = self.relationship_engine.conn.execute('''
                SELECT COUNT(*) as shared_entities
                FROM event_entities ee1
                JOIN event_entities ee2 ON ee1.entity_id = ee2.entity_id
                WHERE ee1.event_id = ? AND ee2.event_id = ?
            ''', (event[0], event[0]))  # event[0] is the event_id
            shared_count = cursor.fetchone()[0]
            # Max 0.3 boost per relationship.
            relationship_scores.append(min(0.3, shared_count * 0.1))

        if relationship_scores:
            relationship_confidence = max(relationship_scores)

    return min(1.0, base_confidence + frequency_boost + relationship_confidence)
313
+
314
  async def analyze_event(self, text):
315
  try:
316
  # Parallel extraction
317
  entities_future = self.extract_entities(text)
318
+ temporal_data = self.extract_temporal(text)
319
  locations_future = self.extract_locations(text)
320
 
321
  # Gather async results
 
327
  entities['locations'] = locations
328
  entities['temporal'] = temporal_data
329
 
 
 
 
 
 
 
 
 
330
  # Find related events
331
  related_events = self.relationship_engine.find_related_events({
332
  'text': text,
333
  'entities': entities
334
  })
335
 
336
+ # Calculate confidence with enhanced logic
337
+ confidence = self.calculate_confidence(entities, temporal_data, related_events)
338
+
339
+ # Store event if confidence meets threshold
340
+ cursor = None
341
+ if confidence >= 0.6:
342
+ cursor = self.relationship_engine.conn.execute(
343
+ 'INSERT INTO events (text, timestamp, confidence) VALUES (?, ?, ?)',
344
+ (text, datetime.now().isoformat(), confidence)
345
  )
346
+ event_id = cursor.lastrowid
347
+
348
+ # Store entities and their relationships
349
+ self.relationship_engine.store_entities(event_id, {
350
+ 'person': entities['people'],
351
+ 'organization': entities['organizations'],
352
+ 'location': entities['locations'],
353
+ 'temporal': temporal_data,
354
+ 'hashtag': entities['hashtags']
355
+ })
356
+
357
+ # Update entity relationships
358
+ self.relationship_engine.update_entity_relationships(event_id)
359
+
360
+ self.relationship_engine.conn.commit()
361
+
362
+ # Get entity relationships for rich output
363
+ entity_relationships = []
364
+ if cursor and cursor.lastrowid:
365
+ query = '''
366
+ SELECT DISTINCT er.*,
367
+ e1.entity_text as source_text, e1.entity_type as source_type,
368
+ e2.entity_text as target_text, e2.entity_type as target_type
369
+ FROM event_entities ee
370
+ JOIN entity_relationships er ON ee.entity_id IN (er.source_entity_id, er.target_entity_id)
371
+ JOIN entities e1 ON er.source_entity_id = e1.id
372
+ JOIN entities e2 ON er.target_entity_id = e2.id
373
+ WHERE ee.event_id = ?
374
+ '''
375
+ entity_relationships = self.relationship_engine.conn.execute(query, (cursor.lastrowid,)).fetchall()
376
 
377
  result = {
378
  "text": text,
 
383
  {
384
  "text": event[1],
385
  "timestamp": event[2],
386
+ "confidence": event[3],
387
+ "shared_entities": event[4] if len(event) > 4 else None
388
  }
389
  for event in related_events
390
+ ],
391
+ "entity_relationships": [
392
+ {
393
+ "type": rel[3],
394
+ "source": rel[6],
395
+ "target": rel[8],
396
+ "confidence": rel[4]
397
+ }
398
+ for rel in entity_relationships
399
+ ] if entity_relationships else []
400
  }
401
 
 
 
 
 
 
 
 
 
402
  return result
403
 
404
  except Exception as e:
405
  return {"error": str(e)}
406
 
407
def get_entity_statistics(self):
    """Get statistics about stored entities and relationships"""
    conn = self.relationship_engine.conn
    # Map each result key to the query that fills it; dict insertion order
    # keeps the output keys in the original order.
    queries = {
        # Entity counts by type
        'entity_counts': '''
            SELECT entity_type, COUNT(*) as count, AVG(frequency) as avg_frequency
            FROM entities
            GROUP BY entity_type
        ''',
        # Most frequent entities
        'frequent_entities': '''
            SELECT entity_text, entity_type, frequency
            FROM entities
            ORDER BY frequency DESC
            LIMIT 10
        ''',
        # Relationship statistics
        'relationship_stats': '''
            SELECT relationship_type, COUNT(*) as count, AVG(confidence) as avg_confidence
            FROM entity_relationships
            GROUP BY relationship_type
        ''',
    }
    return {name: conn.execute(sql).fetchall() for name, sql in queries.items()}
437
+
438
  # Initialize analyzer
439
  analyzer = EventAnalyzer()
440