parth parekh commited on
Commit
7e63028
0 Parent(s):

working demo

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .venv
Dockerfile ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.12-slim
2
+
3
+ # Create a new user
4
+ RUN useradd -m user
5
+
6
+ WORKDIR /app
7
+
8
+ RUN apt-get update && apt-get install -y \
9
+ libglib2.0-0 \
10
+ libsm6 \
11
+ libxext6 \
12
+ libxrender-dev \
13
+ libgl1-mesa-glx \
14
+ wget \
15
+ && rm -rf /var/lib/apt/lists/*
16
+
17
+ COPY requirements.txt .
18
+ RUN pip install --no-cache-dir -r requirements.txt
19
+
20
+ COPY . .
21
+
22
+ # Change ownership of the /app directory to the new user
23
+ RUN chown -R user:user /app
24
+
25
+ # Switch to the new user
26
+ USER user
27
+
28
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "4"]
README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Contact Sharing Recognizer API
3
+ emoji: 🤙
4
+ colorFrom: indigo
5
+ colorTo: pink
6
+ sdk: docker
7
+ pinned: false
8
+ ---
9
+
10
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/app.cpython-312.pyc ADDED
Binary file (4.43 kB). View file
 
__pycache__/predictor.cpython-312.pyc ADDED
Binary file (9.06 kB). View file
 
__pycache__/test.cpython-312.pyc ADDED
Binary file (11.6 kB). View file
 
app.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel
3
+ import torch
4
+ from torch.nn.functional import softmax
5
+ import re
6
+ from predictor import predict, batch_predict # Assuming batch_predict is in predictor module
7
+
8
+ app = FastAPI(
9
+ title="Contact Information Detection API",
10
+ description="API for detecting contact information in text, great thanks to xxparthparekhxx/ContactShieldAI for the model",
11
+ version="1.0.0",
12
+ docs_url="/"
13
+ )
14
+
15
def preprocess_text(text):
    """Return *text* with punctuation removed.

    Word characters, whitespace, '@' and '.' are kept; '@' and '.' survive
    because e-mail addresses rely on them. Everything else is dropped.
    """
    unwanted_chars = re.compile(r'[^\w\s@.]')
    return unwanted_chars.sub('', text)
18
+
19
# Request body for the single-text endpoint.
class TextInput(BaseModel):
    # The text to inspect for contact information.
    text: str


# Request body for the batch endpoint.
class BatchTextInput(BaseModel):
    # The texts to inspect; results are returned in the same order.
    texts: list[str]
24
+
25
# Compiled once at import time; every pattern is matched case-insensitively.
_CONTACT_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        # Email. The TLD class used to be [A-Z|a-z], which also accepted a literal '|'.
        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
        r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',  # Phone number
        r'\b\d{5}(?:[-\s]\d{4})?\b',  # ZIP code
        r'\b\d+\s+[\w\s]+(?:street|st|avenue|ave|road|rd|highway|hwy|square|sq|trail|trl|drive|dr|court|ct|park|parkway|pkwy|circle|cir|boulevard|blvd)\b\s*(?:[a-z]+\s*\d{1,3})?(?:,\s*(?:apt|bldg|dept|fl|hngr|lot|pier|rm|ste|unit|#)\s*[a-z0-9-]+)?(?:,\s*[a-z]+\s*[a-z]{2}\s*\d{5}(?:-\d{4})?)?',  # Street address
        r'(?:http|https)://(?:www\.)?[a-zA-Z0-9-]+\.[a-zA-Z]{2,}(?:/[^\s]*)?',  # Website URL
    )
)


def check_regex_patterns(text):
    """Return True if *text* matches any of the contact-information patterns.

    The patterns cover e-mail addresses, 10-digit phone numbers, ZIP codes,
    street addresses and http(s) URLs. Matching is case-insensitive and uses
    ``search``, so a match anywhere in the text counts.

    Args:
        text: The string to scan.

    Returns:
        bool: True on the first matching pattern, False if none match.
    """
    return any(pattern.search(text) for pattern in _CONTACT_PATTERNS)
38
+
39
@app.post("/detect_contact", summary="Detect contact information in text")
async def detect_contact(input: TextInput):
    """Classify a single text as containing contact information or not.

    The regex patterns are tried first; the model is consulted only when no
    pattern matches.

    Args:
        input: Request body carrying the text to inspect.

    Returns:
        dict: ``text`` (the original input), ``is_contact_info`` (bool) and
        ``method`` (``"regex"`` or ``"model"``).

    Raises:
        HTTPException: 500 with the error message if anything fails.
    """
    try:
        preprocessed_text = preprocess_text(input.text)

        # The raw text is checked too: preprocess_text() strips ':' and '/',
        # so the URL pattern could never match the cleaned text alone.
        if check_regex_patterns(input.text) or check_regex_patterns(preprocessed_text):
            return {
                "text": input.text,
                "is_contact_info": True,
                "method": "regex"
            }

        # No regex hit: fall back to the model, which sees the cleaned text.
        is_contact = predict(preprocessed_text)
        return {
            "text": input.text,
            "is_contact_info": is_contact == 1,
            "method": "model"
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
61
+
62
@app.post("/batch_detect_contact", summary="Detect contact information in batch of texts")
async def batch_detect_contact(inputs: BatchTextInput):
    """Classify a batch of texts, using the model only where regex finds nothing.

    Args:
        inputs: Request body carrying the list of texts to inspect.

    Returns:
        list[dict]: One entry per input text, in input order, each with
        ``text`` (the original input), ``is_contact_info`` (bool) and
        ``method`` (``"regex"`` or ``"model"``).

    Raises:
        HTTPException: 500 with the error message if anything fails.
    """
    try:
        preprocessed_texts = [preprocess_text(text) for text in inputs.texts]

        # The raw text is checked too: preprocess_text() strips ':' and '/',
        # so the URL pattern could never match the cleaned text alone.
        regex_results = [
            check_regex_patterns(raw) or check_regex_patterns(cleaned)
            for raw, cleaned in zip(inputs.texts, preprocessed_texts)
        ]

        # Only texts without a regex hit are sent to the model, in one batch.
        texts_for_model = [
            cleaned
            for cleaned, regex_match in zip(preprocessed_texts, regex_results)
            if not regex_match
        ]
        model_results = batch_predict(texts_for_model) if texts_for_model else []

        # Model predictions come back in the order the texts were submitted,
        # so they are consumed sequentially while walking the inputs.
        remaining_predictions = iter(model_results)
        results = []
        for raw, regex_match in zip(inputs.texts, regex_results):
            if regex_match:
                results.append({
                    "text": raw,
                    "is_contact_info": True,
                    "method": "regex"
                })
            else:
                results.append({
                    "text": raw,
                    # bool() converts the numpy value into a JSON-serialisable bool.
                    "is_contact_info": bool(next(remaining_predictions)),
                    "method": "model"
                })

        return results
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
contact_sharing_epoch_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdb70e711c212856ce3df95b82afbae57b8fc34243b3f541ecd65963fa81fd92
3
+ size 813497259
load_test.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import aiohttp
3
+ import json
4
+ from tqdm.asyncio import tqdm
5
+ import time
6
+ from test import test_texts
7
+
8
+ url = "https://vidhitmakvana1-contact-sharing-recognizer-api.hf.space/detect_contact"
9
+ concurrent_requests = 2
10
+
11
async def process_text(session, text, semaphore):
    """POST one text to the detection endpoint and return the parsed reply.

    The semaphore limits how many requests run at once. A 429 response is
    retried after a 60 second pause, with no retry limit. The reported
    ``response_time`` is measured from the first attempt, so it includes any
    time spent waiting on rate limits.

    Returns:
        The JSON response with a ``response_time`` key added, or None when
        the server answers with any status other than 200 or 429.
    """
    request_body = json.dumps({"text": text})
    request_headers = {"Content-Type": "application/json"}

    async with semaphore:
        started_at = time.time()
        while True:
            async with session.post(url, data=request_body, headers=request_headers) as response:
                status = response.status

                if status == 429:
                    print(f"Rate limit exceeded. Waiting for 60 seconds before retrying...")
                    await asyncio.sleep(60)
                    continue

                if status != 200:
                    print(f"Error for text: {text}")
                    print(f"Status code: {status}")
                    print(f"Response: {await response.text()}")
                    return None

                result = await response.json()
                result['response_time'] = time.time() - started_at
                return result
32
+
33
async def main():
    """Send the repeated test set to the endpoint and print summary statistics.

    Every text in ``test_texts`` is assumed to contain contact information, so
    accuracy is the share of answered requests flagged as contact info.
    Requests that failed (``None`` results) are excluded from both accuracy
    and the average response time.
    """
    repetitions = 36  # how many copies of the test set are sent per run
    semaphore = asyncio.Semaphore(concurrent_requests)
    async with aiohttp.ClientSession() as session:
        tasks = [
            process_text(session, text, semaphore)
            for text in test_texts * repetitions
        ]
        results = await tqdm.gather(*tasks)

    # Iterate over every result; zipping against test_texts used to cut the
    # report off after the first copy of the test set.
    answered = [result for result in results if result]

    correct_predictions = 0
    total_response_time = 0

    for result in answered:
        print(f"Text: {result['text']}")
        # The endpoint does not always return a probability; print it only when present.
        if 'contact_probability' in result:
            print(f"Contact Probability: {result['contact_probability']:.4f}")
        print(f"Is Contact Info: {result['is_contact_info']}")
        print(f"Response Time: {result['response_time']:.4f} seconds")
        print("---")

        if result['is_contact_info']:
            correct_predictions += 1

        total_response_time += result['response_time']

    total_predictions = len(answered)
    if total_predictions == 0:
        # Avoid dividing by zero when every request failed.
        print("No successful responses to report.")
        return

    accuracy = correct_predictions / total_predictions
    average_response_time = total_response_time / total_predictions
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Average Response Time: {average_response_time:.4f} seconds")
60
+
61
+ if __name__ == "__main__":
62
+ while True:
63
+ start_time = time.time()
64
+ asyncio.run(main())
65
+ end_time = time.time()
66
+ total_time = end_time - start_time
67
+ print(f"\nTotal execution time: {total_time:.2f} seconds")
predictor.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torchtext.vocab import build_vocab_from_iterator, GloVe
5
+ from torchtext.data.utils import get_tokenizer
6
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
7
+
8
class ContactSharingClassifier(nn.Module):
    """Text classifier: embedding, bidirectional LSTM, parallel Conv1d branches, MLP head.

    Each convolution branch is max-pooled over the sequence dimension; the
    pooled branch outputs are concatenated, regularised with dropout and
    layer normalisation, and passed through two linear layers.
    """

    def __init__(self, vocab_size, embed_dim, num_filters, filter_sizes, lstm_hidden_dim, output_dim, dropout, pad_idx):
        super().__init__()
        # Width of the concatenated pooled features (one slice per kernel size).
        pooled_width = len(filter_sizes) * num_filters
        # The LSTM is bidirectional, so it emits twice the hidden size per step.
        lstm_channels = lstm_hidden_dim * 2

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embed_dim, lstm_hidden_dim, bidirectional=True, batch_first=True)

        conv_branches = []
        for kernel in filter_sizes:
            conv_branches.append(
                nn.Conv1d(in_channels=lstm_channels, out_channels=num_filters, kernel_size=kernel)
            )
        self.convs = nn.ModuleList(conv_branches)

        self.fc1 = nn.Linear(pooled_width, pooled_width // 2)
        self.fc2 = nn.Linear(pooled_width // 2, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(pooled_width)

    def forward(self, text):
        """Return the unnormalised class scores for a batch of token-index sequences.

        ``text`` is a 2-D tensor of token indices, batch first. The sequence
        must be at least as long as the largest convolution kernel.
        """
        token_vectors = self.embedding(text)
        sequence_features, _ = self.lstm(token_vectors)
        # Conv1d expects channels before the sequence dimension.
        channels_first = sequence_features.permute(0, 2, 1)
        activations = [F.relu(branch(channels_first)) for branch in self.convs]
        # Global max-pool: the pooling window spans the whole remaining sequence.
        peaks = [F.max_pool1d(act, act.shape[2]).squeeze(2) for act in activations]
        merged = self.layer_norm(self.dropout(torch.cat(peaks, dim=1)))
        hidden = self.dropout(F.relu(self.fc1(merged)))
        return self.fc2(hidden)
33
+
34
+ # Initialize tokenizer and vocabulary
35
+ tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
36
+ vocab = torch.load('vocab.pth') # Assuming you've saved the vocabulary
37
+
38
+ # Define text pipeline
39
def text_pipeline(x):
    """Tokenize *x* and return the vocabulary index of each token, in order."""
    token_ids = []
    for token in tokenizer(x):
        token_ids.append(vocab[token])
    return token_ids
41
+
42
+ # Model parameters
43
+ VOCAB_SIZE = len(vocab)
44
+ EMBED_DIM = 600
45
+ NUM_FILTERS = 600
46
+ FILTER_SIZES = [3, 4, 5, 6, 7, 8, 9, 10]
47
+ LSTM_HIDDEN_DIM = 768
48
+ OUTPUT_DIM = 2
49
+ DROPOUT = 0.5
50
+ PAD_IDX = vocab["<pad>"]
51
+
52
+ # Load the model
53
+
54
+ model = ContactSharingClassifier(VOCAB_SIZE, EMBED_DIM, NUM_FILTERS, FILTER_SIZES, LSTM_HIDDEN_DIM, OUTPUT_DIM, DROPOUT, PAD_IDX)
55
+ model.load_state_dict(torch.load('contact_sharing_epoch_1.pth', map_location=device))
56
+ model.to(device)
57
+ model.eval()
58
+
59
+ # Test sentences
60
+ test_sentences = [
61
+ "You can reach me at my electronic mail address, it's my first name dot last name at that popular search engine company's mail service.",
62
+ "Call me on my cellular device, the digits are the same as the year the Declaration of Independence was signed, followed by my birth year, twice.",
63
+ "Visit my online presence at triple w dot my full name without spaces or punctuation dot com.",
64
+ "Send a message to username 'not_my_real_name' on that instant messaging platform that starts with 'disc' and ends with 'ord'.",
65
+ "My contact info is hidden in this sentence: Eight Six Seven Five Three Oh Nine.",
66
+ "Find me on the professional networking site, just search for my name plus 'software engineer in San Francisco'.",
67
+ "My handle on the bird-themed social media platform is at symbol followed by 'definitely_not_my_email_address'.",
68
+ "You know that video sharing site? My channel is there, just add 'cool_coder_' before my full name, all lowercase.",
69
+ "I'm listed in the phone book under 'Smith, John' but replace 'Smith' with my actual last name and 'John' with my first name.",
70
+ "My contact details are encrypted: Rot13('[email protected]')",
71
+
72
+ # New non-contact sharing examples
73
+ "The weather today is absolutely beautiful, perfect for a picnic in the park.",
74
+ "I'm really excited about the new sci-fi movie coming out next month.",
75
+ "Did you hear about the latest advancements in artificial intelligence? It's fascinating!",
76
+ "I'm planning to go hiking this weekend in the nearby mountains.",
77
+ "The recipe calls for two cups of flour and a pinch of salt.",
78
+ "The annual tech conference will be held virtually this year due to ongoing health concerns.",
79
+ "I've been learning to play the guitar for the past six months. It's challenging but rewarding.",
80
+ "The local farmer's market has the freshest produce every Saturday morning.",
81
+ "Did you catch the game last night? It was an incredible comeback in the final quarter!",
82
+ "Lets do '42069' tonight it will be really fun what do you say ?"
83
+ ]
84
+
85
+ # JIT Script the model for faster inference
86
+ scripted_model = torch.jit.script(model)
87
+
88
+ # Preallocate padding tensor to avoid repeated memory allocation
89
+ MAX_LEN = max(FILTER_SIZES)
90
+ padding_tensor = torch.zeros(1, MAX_LEN, dtype=torch.long).to(device)
91
+
92
+ # Prediction function using JIT and inference optimizations
93
def predict(text):
    """Return the predicted class index for a single text.

    The text is tokenized, right-padded to at least ``MAX_LEN`` tokens (the
    largest convolution kernel size) and passed through the scripted model.

    Args:
        text: The string to classify.

    Returns:
        int: Index of the highest-scoring class; callers treat 1 as
        "contains contact info".
    """
    with torch.inference_mode():  # Use inference mode instead of no_grad
        # dtype is pinned: an empty token list would otherwise produce a float
        # tensor, which the embedding layer rejects.
        inputs = torch.tensor([text_pipeline(text)], dtype=torch.long).to(device)

        # Pad short inputs so every convolution kernel fits.
        missing = MAX_LEN - inputs.size(1)
        if missing > 0:
            # NOTE(review): pads with index 0, as before; confirm this matches vocab["<pad>"].
            inputs = torch.cat([inputs, padding_tensor[:, :missing]], dim=1)

        outputs = scripted_model(inputs)

        return torch.argmax(outputs, dim=1).item()
106
+
107
def batch_predict(texts):
    """Return the predicted class index for each text in *texts*.

    Sequences are right-padded to the longest one in the batch, but never to
    fewer than ``MAX_LEN`` tokens: the largest convolution kernel needs that
    many positions, and a batch of only short texts used to crash the model.

    Args:
        texts: List of strings to classify.

    Returns:
        numpy.ndarray: One class index per input text, in input order; empty
        when *texts* is empty.
    """
    with torch.inference_mode():  # Use inference mode for better performance
        if len(texts) == 0:
            # Nothing to classify; max() below would fail on an empty batch.
            return torch.empty(0, dtype=torch.long).numpy()

        # dtype is pinned so an empty token list still yields an integer tensor.
        inputs = [torch.tensor(text_pipeline(text), dtype=torch.long) for text in texts]

        max_len = max(MAX_LEN, max(len(seq) for seq in inputs))
        # NOTE(review): pads with index 0, as before; confirm this matches vocab["<pad>"].
        padded_inputs = torch.stack([
            torch.cat([seq, torch.zeros(max_len - len(seq), dtype=torch.long)])
            for seq in inputs
        ]).to(device)

        outputs = scripted_model(padded_inputs)

        predictions = torch.argmax(outputs, dim=1).cpu().numpy()
        return predictions
122
+
123
+ # Test the sentences
124
+ for i, sentence in enumerate(test_sentences, 1):
125
+ prediction = predict(sentence)
126
+ result = "Contains contact info" if prediction == 1 else "No contact info"
127
+ print(f"Sentence {i}: {result}")
128
+ print(f"Text: {sentence}\n")
requirements.txt ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ annotated-types==0.7.0
2
+ anyio==4.6.0
3
+ blis==0.7.11
4
+ catalogue==2.0.10
5
+ certifi==2024.8.30
6
+ charset-normalizer==3.3.2
7
+ click==8.1.7
8
+ cloudpathlib==0.19.0
9
+ colorama==0.4.6
10
+ confection==0.1.5
11
+ cymem==2.0.8
12
+ distro==1.9.0
13
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
14
+ fastapi==0.115.0
15
+ filelock==3.13.1
16
+ fsspec==2024.2.0
17
+ greenlet==3.1.1
18
+ groq==0.11.0
19
+ h11==0.14.0
20
+ httpcore==1.0.5
21
+ httpx==0.27.2
22
+ huggingface-hub==0.25.1
23
+ idna==3.10
24
+ Jinja2==3.1.3
25
+ langcodes==3.4.0
26
+ language_data==1.2.0
27
+ marisa-trie==1.2.0
28
+ markdown-it-py==3.0.0
29
+ MarkupSafe==2.1.5
30
+ mdurl==0.1.2
31
+ mpmath==1.3.0
32
+ murmurhash==1.0.10
33
+ networkx==3.2.1
34
+ numpy==1.26.3
35
+ packaging==24.1
36
+ pillow==10.2.0
37
+ preshed==3.0.9
38
+ pydantic==2.9.2
39
+ pydantic_core==2.23.4
40
+ Pygments==2.18.0
41
+ PyYAML==6.0.2
42
+ regex==2024.9.11
43
+ requests==2.32.3
44
+ rich==13.8.1
45
+ safetensors==0.4.5
46
+ setuptools==70.0.0
47
+ shellingham==1.5.4
48
+ smart-open==7.0.4
49
+ sniffio==1.3.1
50
+ spacy==3.7.6
51
+ spacy-legacy==3.0.12
52
+ spacy-loggers==1.0.5
53
+ SQLAlchemy==2.0.35
54
+ srsly==2.4.8
55
+ starlette==0.38.5
56
+ sympy==1.12
57
+ thinc==8.2.5
58
+ tokenizers==0.19.1
59
+ torch==2.2.0
60
+ torchdata==0.7.1
61
+ torchtext==0.16.2
62
+ tqdm==4.66.5
63
+ transformers==4.44.2
64
+ typer==0.12.5
65
+ typing_extensions==4.12.2
66
+ urllib3==2.2.3
67
+ uvicorn==0.30.6
68
+ wasabi==1.1.3
69
+ weasel==0.4.1
70
+ wrapt==1.16.0
test.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import aiohttp
3
+ import json
4
+ from tqdm.asyncio import tqdm
5
+
6
+ test_texts = [
7
+ "You can reach me at triple eight, then the square of 7, followed by 2^10",
8
+ "Drop a line to first_name [underscore] last_name at that company with a fruit logo dot com",
9
+ "Find me on the platform where professionals connect: J. Doe, Senior Developer at TechCorp",
10
+ "Message me on that app with the ghost icon: @ShadowWhisperer2023",
11
+ "Contact via carrier pigeon: coordinates 40.7128° N, 74.0060° W",
12
+ "Ping me on the federated network: @[email protected]",
13
+ "My contact is the reverse of moc.elpmaxe@eodnhoj",
14
+ "Reach out using morse: -... -.-- -....- . -- .- .. .-..",
15
+ "Find me on the platform with blue checkmarks: @RealJohnDoe (parody)",
16
+ "Send a message to username 'l33tc0d3r' on that platform for developers",
17
+ "You can locate me at the place where the streets have no name, in the city of angels",
18
+ "My digits are the Fibonacci sequence up to 21, concatenated",
19
+ "Contact: foxtrot oscar oscar at bravo alpha romeo dot charlie oscar mike",
20
+ "Beep me at the number you get when you multiply 555 by 1.5, then add 867-5309",
21
+ "I'm on that app where you share shortvideos: @Dancing2023",
22
+ "Reach out via electronic mail to 'lastnamefirstinitial' at that search engine company dot com",
23
+ "Call me at the number you get when you solve this equation: 2x + 5 = 13, then 555-MATH",
24
+ "My handle on that photo-sharing app is @SunsetSnapper_42",
25
+ "You can find me at the intersection of Binary Boulevard and Algorithm Avenue",
26
+ "Contact info: romeo oscar charlie kilo echo tango mike alpha november at zulu uniform lima uniform dot india oscar",
27
+ "Find me at 51.4778° N, and solve for x: x - 0.0019 = 0.1278° W",
28
+ "DM me at 📧👤💻🐦. Guess the platform 😉",
29
+ "If you add 2 to the area code of Los Angeles, you'll find the first 3 digits of my number",
30
+ "Ping me on the platform with 2 birds in its logo (and no, it's not a zoo!)",
31
+ "You can decode my email address: base64 for JmRvZGVAc2FtcGxlLmNvbQ==",
32
+ "You’ll find me on the platform that rhymes with 'squeaker' and involves chirps",
33
+ "If you reverse the letters of com.gmail@john and remove 'moc', you'll get my contact",
34
+ "For inquiries, try contacting me at 'first name.last name', but think of the sound fruit makes when it's dropped",
35
+ "Use morse to reach out: dash dot dot dash underscore underscore dash dot dot (first name at techcorp dot com)",
36
+ "Contact: solve 5x - 3 = 12 for x, that’s my lucky number for the area code, followed by the square root of 144 for the rest",
37
+ "Reach out on that site where professionals connect, my name rhymes with 'noe' and I’m a senior engineer at T-Corp",
38
+ "Shoot me a message on the photo-sharing app where sunsets get all the likes: handle is the same as my name in reverse",
39
+ "If you count the number of words in 'five stars' you'll get the first two digits of my handle on that coding platform",
40
+ "My email is hidden: find the cube root of 27, followed by the first name of a famous fruit and 'dot com'",
41
+ "If you multiply the number of days in March by 5, you'll get my contact digits",
42
+ "Contact me on the short-video app, my handle starts with a 'D' and ends with '23'!",
43
+ "Try to find me where algorithms reign and the search begins: think of a query that contains my last name and 'solutions'",
44
+ "Use binary to get my location: 01000101 01001110 01000111",
45
+ "You can ping me at 'bestcoder42' on the app where code flows like water",
46
+ "My digits: sum of first four Fibonacci numbers for the area code, and the next three for the phone number",
47
+ "Find me at 51.4778° N, and solve for x: x - 0.0019 = 0.1278° W",
48
+ "DM me at 📧👤💻🐦. Guess the platform 😉",
49
+ "If you add 2 to the area code of Los Angeles, you'll find the rest of my digits hiding nearby",
50
+ "Ping me at 'FirstnameLastname reversed' at that search company 🧐",
51
+ "The sum of the first two primes gives you the first part of my number, and 10 squared gets you the rest",
52
+ "Drop a message on the 'app named after a bird' to @JohnDoe2024 🌐",
53
+ "Morse this one: .... . .-.. .-.. --- @ secret-agent",
54
+ "Let’s connect: 3rd letter of my last name, then an underscore, then my birth year at fruit-company dot com 🍏",
55
+ "I'm on the platform for professionals but my handle is just a smiley face, hint hint 😉",
56
+ "Look up the coordinates of Big Ben and you might just find where I hang out 🕰️",
57
+ "Combine the atomic number of helium with my favorite fruit and you'll get my email",
58
+ "Find me at 'underscore emoji fan' at the app where people share funny short videos 🤳",
59
+ "Think of the number 404, then multiply it by 2, that’s the area code. The rest is easy!",
60
+ "I'm always up for a chat, just decode 01000011 01100001 01101100 01101100",
61
+ "I’m @SilentWhisper42 on the app where conversations vanish into thin air 👻",
62
+ "Track me down with this: Alpha-Bravo-Charlie at that company with flying machines ✈️",
63
+ "Ever heard of Fibonacci? My digits follow the pattern, up to 21",
64
+ "Search for the name of the singer of 'Rocket Man,' and you'll have part of my contact info 🧑‍🚀",
65
+ "Just send a message to 'TechWizard' at the email service that rhymes with whale-mail 🐋",
66
+ "My username on that site for devs is 'leet_hacker', but you’ll need to solve for x to figure out the rest!",
67
+ "My digits? Picture the number of planets in the solar system before 2006, then square it.",
68
+ "If you know the atomic numbers of oxygen and hydrogen, combine them and you have my first two digits.",
69
+ "Contact me where knowledge is power, at the symbol of illumination followed by 'dot org'.",
70
+ "I'm @user and you'll find me on the app where one tweets, but reverse that bird's sound first.",
71
+ "Think of the area code for New York, subtract one, and you’re almost there.",
72
+ "Reach out at 'wizard@', then imagine the home of the brave and the land of the free, followed by 'com'.",
73
+ "My handle is a palindrome on that platform where people share their lives one square at a time.",
74
+ "Find me at the intersection of 7 squared and the cube root of 8, you'll know the digits.",
75
+ "Ping me at 'Firstname reversed' dot 'company with a shopping cart logo'.",
76
+ "Send a message to the name of the president in 1993 at the platform where developers share code.",
77
+ "You can reach me at the number that shares its name with a famous Chicago bull, then add 10.",
78
+ "Reach out on the platform with the blue checkmarks, where I’m known as '56/8'.",
79
+ "The username is easy if you know your ASCII: 83 117 110 83 101 116 52 50.",
80
+ "For contact info, divide the year Armstrong walked on the moon by two and add the last prime number.",
81
+ "You can email me at the world's largest retailer with a name that rhymes with 'Hamazon'.",
82
+ "Catch me on the app where professionals hang out: it’s the opposite of 'InTouch'.",
83
+ "Look for me on the 'bird app' where my handle is my initials followed by the number of days in a leap year.",
84
+ "Ping me at 'Jupiter's largest moon' dot 'the company that sends rockets into space'.",
85
+ "Reach out at the sum of the angles in a triangle, followed by 'degrees at mail dot com'.",
86
+ "Message me where bytes are shared: I go by '@user_hexadecimal_4D2' on that site.",
87
+ "You can send it to 🌍 world_dot_explorer @ 'web page where you explore the world'.",
88
+ "数字 4 (Chinese), then 'underscore', then 'techie' at the search giant.",
89
+ "Write to me at the country with a maple leaf symbol, at their email provider.",
90
+ "Feel free to ping me at Жака at mail dot ru (that’s Russian for Jack).",
91
+ "If you take the French word for 'sun' and add 'shine', that’s where you can reach me.",
92
+ "My digits? They hide in plain sight: 42-4*8+18. Just subtract the stars.",
93
+ "Drop me a line at 'developer' followed by the country code for India, dot com.",
94
+ "Where to find me? It's obvious: 'who's' dot 'this', at the dot that ends with 'gov'.",
95
+ "A long story short: email me at 'fruit-company', the one that used to be a tree 🌳.",
96
+ "You'll get my email by figuring out: my first pet’s name, the city I grew up in, dot org.",
97
+ "My number? It's encrypted as SHA-256. Just decode it and you'll know!",
98
+ "Write to me in the ancient language of the Romans: 'maximus at something_prime dot com'.",
99
+ "Reach out to the winner of 2022's football world cup at 'world champions dot com'.",
100
+ "Find me at the place where the Eiffel Tower stands tall, at dot 'home of baguettes'.",
101
+ "Hit me up via snail mail: Just translate 'rabbit' into Italian and add 'at Italian mail'.",
102
+ "My digits form a prime sequence starting from 11, just keep counting!",
103
+ "For my number, follow the clues hidden in Da Vinci's most famous painting."
104
+
105
+ ]
106
+ import time
107
+ # url = "https://vidhitmakvana1-contact-sharing-recognizer-api.hf.space/batch_detect_contact"
108
+ url = "http://localhost:8000/batch_detect_contact"
109
+
110
async def process_batch(session, texts):
    """POST one batch of texts to the batch endpoint and return the parsed results.

    Each result gets a ``response_time`` key holding the batch round-trip
    time divided evenly across the submitted texts.

    Returns:
        The list of result dicts, or None when the server does not answer 200.
    """
    request_body = json.dumps({"texts": texts})
    request_headers = {"Content-Type": "application/json"}

    started_at = time.time()
    async with session.post(url, data=request_body, headers=request_headers) as response:
        if response.status != 200:
            print(f"Error for batch")
            print(f"Status code: {response.status}")
            print(f"Response: {await response.text()}")
            return None

        results = await response.json()
        elapsed = time.time() - started_at
        if results:
            per_text_time = elapsed / len(texts)
            for entry in results:
                entry['response_time'] = per_text_time
        return results
127
+
128
async def main():
    """Send the inflated test set to the batch endpoint and print summary statistics.

    Every text in ``test_texts`` is assumed to contain contact information, so
    accuracy is the share of returned results flagged as contact info.
    Batches that failed (``None``) are left out of the statistics.
    """
    # Inflate test_texts: repeat the test set 100 times.
    inflated_texts = test_texts * 100

    async with aiohttp.ClientSession() as session:
        batch_size = 1000
        batches = [inflated_texts[i:i + batch_size] for i in range(0, len(inflated_texts), batch_size)]

        tasks = [process_batch(session, batch) for batch in batches]
        all_results = await tqdm.gather(*tasks)

    # Filter out failed batches BEFORE iterating them; a None batch used to
    # raise TypeError because the filter came after the inner loop.
    results = [item for sublist in all_results if sublist for item in sublist]

    correct_predictions = 0
    total_predictions = len(results)
    total_response_time = 0

    for result in results:
        if result:
            print(f"Text: {result['text']}")
            print(f"Is Contact Info: {result['is_contact_info']}")
            print(f"Method: {result['method']}")
            print(f"Response Time: {result['response_time']:.4f} seconds")
            print("---")

            # Assuming all texts in test_texts are actually contact information
            if result['is_contact_info']:
                correct_predictions += 1

            total_response_time += result['response_time']

    if total_predictions == 0:
        # Avoid dividing by zero when every batch failed.
        print("No successful predictions to report.")
        return

    accuracy = correct_predictions / total_predictions
    average_response_time = total_response_time / total_predictions
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Average Response Time: {average_response_time:.4f} seconds")
163
+
164
+ if __name__ == "__main__":
165
+ while True:
166
+ asyncio.run(main())
vocab.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28edf2ae44d144c4566f0e5f95b856391166ac138ee578bac7fd9db151e1790a
3
+ size 5184491