File size: 1,957 Bytes
c3af845
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# src/ontology.py
from typing import List, Dict, Pattern
import re

class OntologyRegistry:
    """Registry for pattern matching and entity validation."""
    
    def __init__(self) -> None:
        self.temporal_patterns: List[str] = [
            r'\b\d{1,2}:\d{2}\s*(?:AM|PM|am|pm)?\b',
            r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2}(?:st|nd|rd|th)?,? \d{4}\b',
            r'\btomorrow\b',
            r'\bin \d+ (?:days?|weeks?|months?)\b'
        ]
        
        self.location_patterns: List[str] = [
            r'\b(?:in|at|from|to) ([A-Z][a-zA-Z]+(,? [A-Z]{2})?)\b',
            r'\b[A-Z][a-zA-Z]+ Base\b',
            r'\bHeadquarters\b',
            r'\bHQ\b'
        ]
        
        self.entity_types: Dict[str, str] = {
            'PER': 'person',
            'ORG': 'organization',
            'LOC': 'location',
            'MISC': 'miscellaneous'
        }

        # Compile patterns for better performance
        self._compiled_patterns: Dict[str, List[Pattern]] = {
            'temporal': [re.compile(p) for p in self.temporal_patterns],
            'location': [re.compile(p) for p in self.location_patterns]
        }

    def validate_pattern(self, text: str, pattern_type: str) -> List[str]:
        """
        Validate text against specified pattern type.
        
        Args:
            text: Input text to validate
            pattern_type: Type of pattern to match ('temporal' or 'location')
            
        Returns:
            List of matched strings
        """
        matches = []
        patterns = self._compiled_patterns.get(pattern_type, [])
        
        for pattern in patterns:
            matches.extend(match.group() for match in pattern.finditer(text))
            
        return matches

    def get_entity_type(self, ner_type: str) -> str:
        """Map NER entity type to ontology type."""
        return self.entity_types.get(ner_type, 'miscellaneous')