parth parekh committed on
Commit
dd174fa
·
1 Parent(s): fa5ad45

added a process to remove punctuation to improve accuracy

Browse files
Files changed (2) hide show
  1. app.py +8 -4
  2. test.py +57 -35
app.py CHANGED
@@ -12,6 +12,9 @@ app = FastAPI(
12
  docs_url="/"
13
  )
14
 
 
 
 
15
 
16
  class TextInput(BaseModel):
17
  text: str
@@ -36,8 +39,10 @@ def check_regex_patterns(text):
36
  @app.post("/detect_contact", summary="Detect contact information in text")
37
  async def detect_contact(input: TextInput):
38
  try:
 
 
39
  # First, check with regex patterns
40
- if check_regex_patterns(input.text):
41
  return {
42
  "text": input.text,
43
  "contact_probability": 1.0,
@@ -45,9 +50,8 @@ async def detect_contact(input: TextInput):
45
  "method": "regex"
46
  }
47
 
48
- # If no regex patterns match, use the model
49
- # Probability of containing contact info
50
- is_contact = predict(input.text) # You can adjust this threshold as needed
51
  return {
52
  "text": input.text,
53
  "contact_probability": 0.98,
 
12
  docs_url="/"
13
  )
14
 
15
+ def preprocess_text(text):
16
+ # Remove all punctuation except for @ and . which are often used in email addresses
17
+ return re.sub(r'[^\w\s@.]', '', text)
18
 
19
  class TextInput(BaseModel):
20
  text: str
 
39
  @app.post("/detect_contact", summary="Detect contact information in text")
40
  async def detect_contact(input: TextInput):
41
  try:
42
+ preprocessed_text = preprocess_text(input.text)
43
+
44
  # First, check with regex patterns
45
+ if check_regex_patterns(preprocessed_text):
46
  return {
47
  "text": input.text,
48
  "contact_probability": 1.0,
 
50
  "method": "regex"
51
  }
52
 
53
+ # If no regex patterns match, use the model
54
+ is_contact = predict(preprocessed_text)
 
55
  return {
56
  "text": input.text,
57
  "contact_probability": 0.98,
test.py CHANGED
@@ -1,46 +1,68 @@
1
- import requests
 
2
  import json
 
3
 
4
  test_texts = [
5
- "My email is [email protected]",
6
- "Call me at (123) 456-7890",
7
- "I live at 123 Main St, New York, NY 10001",
8
- "Let's meet at the park tomorrow",
9
- "My phone number is 555-1234",
10
- "You can reach me on Skype: user123",
11
- "Reach me at one two three dot four five six dot seven eight nine zero",
12
- "My handle is at_symbol_user_123 on that bird app",
13
- "Drop me a line: first_name (dot) last_name [at] big_search_engine (dot) com",
14
- "Ring me: area code seven-seven-seven then half a dozen, a quartet, and two pairs",
15
- "Find me on the gram: @cool_user_2023",
16
- "I'm on that professional network, just search for John Doe from Acme Corp",
17
- "Send a raven to Winterfell, care of the Stark family",
18
- "Ping me on IRC: /msg CoolDude42",
19
- "You can find me at one two three Fake Street, Anytown, State of Confusion",
20
- "My digits are the first ten prime numbers in order",
21
- "Contact info: tango alpha november golf oscar at yankee alpha hotel oscar oscar dot charlie oscar mike",
22
- "Beep me at 555 (not a real area code) then 867-5309",
23
- "I'm on that app where messages disappear, username: GhostWriter99",
24
- "Reach out via electronic mail to 'surname underscore initial' at that fruit company dot com",
25
- "Call me maybe? Area code is square root of 169, then 555-CHAT",
26
  ]
27
 
28
  url = "https://vidhitmakvana1-contact-sharing-recognizer-api.hf.space/detect_contact"
29
 
30
- for text in test_texts:
31
  payload = {"text": text}
32
  headers = {"Content-Type": "application/json"}
 
 
 
 
 
 
 
 
 
 
33
 
34
- response = requests.post(url, data=json.dumps(payload), headers=headers)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
- if response.status_code == 200:
37
- result = response.json()
38
- print(f"Text: {result['text']}")
39
- print(f"Contact Probability: {result['contact_probability']:.4f}")
40
- print(f"Is Contact Info: {result['is_contact_info']}")
41
- print("---")
42
- else:
43
- print(f"Error for text: {text}")
44
- print(f"Status code: {response.status_code}")
45
- print(f"Response: {response.text}")
46
- print("---")
 
1
+ import asyncio
2
+ import aiohttp
3
  import json
4
+ from tqdm.asyncio import tqdm
5
 
6
  test_texts = [
7
+ "You can reach me at triple eight, then the square of 7, followed by 2^10",
8
+ "Drop a line to first_name [underscore] last_name at that company with a fruit logo dot com",
9
+ "Find me on the platform where professionals connect: J. Doe, Senior Developer at TechCorp",
10
+ "Message me on that app with the ghost icon: @ShadowWhisperer2023",
11
+ "Contact via carrier pigeon: coordinates 40.7128° N, 74.0060° W",
12
+ "Ping me on the federated network: @[email protected]",
13
+ "My contact is the reverse of moc.elpmaxe@eodnhoj",
14
+ "Reach out using morse: -... -.-- -....- . -- .- .. .-..",
15
+ "Find me on the platform with blue checkmarks: @RealJohnDoe (parody)",
16
+ "Send a message to username 'l33tc0d3r' on that platform for developers",
17
+ "You can locate me at the place where the streets have no name, in the city of angels",
18
+ "My digits are the Fibonacci sequence up to 21, concatenated",
19
+ "Contact: foxtrot oscar oscar at bravo alpha romeo dot charlie oscar mike",
20
+ "Beep me at the number you get when you multiply 555 by 1.5, then add 867-5309",
21
+ "I'm on that app where you share shortvideos: @Dancing2023",
22
+ "Reach out via electronic mail to 'lastnamefirstinitial' at that search engine company dot com",
23
+ "Call me at the number you get when you solve this equation: 2x + 5 = 13, then 555-MATH",
24
+ "My handle on that photo-sharing app is @SunsetSnapper_42",
25
+ "You can find me at the intersection of Binary Boulevard and Algorithm Avenue",
26
+ "Contact info: romeo oscar charlie kilo echo tango mike alpha november at zulu uniform lima uniform dot india oscar",
 
27
  ]
28
 
29
  url = "https://vidhitmakvana1-contact-sharing-recognizer-api.hf.space/detect_contact"
30
 
31
+ async def process_text(session, text):
32
  payload = {"text": text}
33
  headers = {"Content-Type": "application/json"}
34
+
35
+ async with session.post(url, data=json.dumps(payload), headers=headers) as response:
36
+ if response.status == 200:
37
+ result = await response.json()
38
+ return result
39
+ else:
40
+ print(f"Error for text: {text}")
41
+ print(f"Status code: {response.status}")
42
+ print(f"Response: {await response.text()}")
43
+ return None
44
 
45
+ async def main():
46
+ async with aiohttp.ClientSession() as session:
47
+ tasks = [process_text(session, text) for text in test_texts]
48
+ results = await tqdm.gather(*tasks)
49
+
50
+ correct_predictions = 0
51
+ total_predictions = len(results)
52
+
53
+ for text, result in zip(test_texts, results):
54
+ if result:
55
+ print(f"Text: {result['text']}")
56
+ print(f"Contact Probability: {result['contact_probability']:.4f}")
57
+ print(f"Is Contact Info: {result['is_contact_info']}")
58
+ print("---")
59
+
60
+ # Assuming all texts in test_texts are actually contact information
61
+ if result['is_contact_info']:
62
+ correct_predictions += 1
63
+
64
+ accuracy = correct_predictions / total_predictions
65
+ print(f"Accuracy: {accuracy:.2f}")
66
 
67
+ if __name__ == "__main__":
68
+ asyncio.run(main())