orionweller committed on
Commit
0c3e233
1 Parent(s): 0a78f42

Upload find_splitting_words.py

Files changed (1)
  1. find_splitting_words.py +95 -0
find_splitting_words.py ADDED
@@ -0,0 +1,95 @@
+import argparse
+import pandas as pd
+from nltk.corpus import stopwords
+from nltk.stem import PorterStemmer
+from nltk.tokenize import word_tokenize
+from collections import Counter
+import string
+import os
+
+# Ensure you've downloaded the punkt tokenizer and the stop-word list the first time you run this
+import nltk
+nltk.download('punkt')
+nltk.download('stopwords')
+
+def preprocess_document(doc):
+    """
+    Lowercases and tokenizes a document, removes punctuation and stopwords, and stems the remaining words.
+    """
+    # Lowercase
+    doc = doc.lower()
+    # Remove punctuation
+    doc = doc.translate(str.maketrans('', '', string.punctuation))
+    # Tokenize
+    tokens = word_tokenize(doc)
+    # Remove stop words
+    stop_words = set(stopwords.words('english'))
+    filtered_tokens = [word for word in tokens if word not in stop_words]
+    # Stemming
+    stemmer = PorterStemmer()
+    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
+    return stemmed_tokens
+
+def find_dividing_words(documents):
+    """
+    Identifies candidate words that might split the set of documents into two groups.
+    """
+    all_words = []
+    per_doc_word_counts = []
+
+    for doc in documents:
+        preprocessed_doc = preprocess_document(doc)
+        all_words.extend(preprocessed_doc)
+        per_doc_word_counts.append(Counter(preprocessed_doc))
+
+    # Overall word frequency
+    overall_word_counts = Counter(all_words)
+
+    # Keep words that appear in roughly half the documents (30-70% of them)
+    num_docs = len(documents)
+    candidate_words = []
+    for word, count in overall_word_counts.items():
+        doc_frequency = sum(1 for doc_count in per_doc_word_counts if doc_count[word] > 0)
+        if 0.3 * num_docs <= doc_frequency <= 0.7 * num_docs:
+            candidate_words.append(word)
+
+    return candidate_words
+
+
+def make_contents(doc):
+    """
+    Returns the contents of a document as a single string.
+    """
+    if "title" in doc and "contents" in doc:
+        return doc["title"] + " " + doc["contents"]
+    if "headline" in doc and "text" in doc:
+        return doc["headline"] + " " + doc["text"]
+    if "title" in doc and "text" in doc:
+        return doc["title"] + " " + doc["text"]
+    if "contents" in doc:
+        return doc["contents"]
+    if "text" in doc:
+        return doc["text"]
+
+
+def main(args):
+    # Read the `.relevant_only` qrels and docs files from the `data/<dataset>` directory
+    base_dir = os.path.join("data", args.dataset)
+    qrels = pd.read_csv(os.path.join(base_dir, "qrels.relevant_only.trec"), sep="\t", header=None, names=["qid", "docid", "rel"])
+    docs = pd.read_json(os.path.join(base_dir, "docs.relevant_only.jsonl"), lines=True)
+
+    for qid in qrels.groupby("qid").groups.keys():
+        # get the relevant documents for the current query
+        relevant_docids = qrels[qrels["qid"] == qid]["docid"].tolist()
+        # get the text for the relevant documents
+        relevant_docs_text = docs[docs["doc_id"].isin(relevant_docids)].apply(lambda x: make_contents(x), axis=1).tolist()
+        splitting_words = find_dividing_words(relevant_docs_text)
+
+        breakpoint()  # drop into the debugger to inspect `splitting_words` for this query
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Find words that might split the set of documents into two groups.')
+    parser.add_argument('dataset', type=str, help='The dataset to use (e.g. "robust04")')
+    args = parser.parse_args()
+    main(args)
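
Not part of the uploaded file, but as a rough usage sketch: assuming find_splitting_words.py is importable from the working directory and the NLTK data above has been downloaded, the core helper can be exercised on a toy, made-up corpus like this (the documents and the expected stems below are illustrative only, not from any dataset in the repository):

# Illustrative only: exercises find_dividing_words from find_splitting_words.py
# on four made-up documents, two about an election and two about football.
from find_splitting_words import find_dividing_words

toy_docs = [
    "The election results were announced on Tuesday night.",
    "Voters lined up early as the election drew record turnout.",
    "The new stadium opened with a sold-out football match.",
    "Fans celebrated the football team's championship win.",
]

# With 4 documents, a candidate must appear in 0.3*4 <= df <= 0.7*4 of them,
# i.e. exactly 2 documents here, so Porter-stemmed terms such as 'elect' and
# 'footbal' are the kind of words returned.
print(find_dividing_words(toy_docs))

The 30-70% document-frequency band is what filters out terms shared by nearly all (or almost none) of a query's relevant documents, leaving only terms that plausibly divide them into two groups.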