import argparse
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from collections import Counter
import string
import os
import streamlit as st
# Ensure the NLTK tokenizer and stopword data are available the first time this runs
import nltk

# Only download the resources that are missing
for resource, path in [("punkt", "tokenizers/punkt"), ("stopwords", "corpora/stopwords")]:
    try:
        nltk.data.find(path)
    except LookupError:
        nltk.download(resource)

def preprocess_document(doc):
    """
    Tokenizes, removes punctuation, stopwords, and stems words in a single document.
    """
    # Lowercase
    doc = doc.lower()
    # Remove punctuation
    doc = doc.translate(str.maketrans('', '', string.punctuation))
    # Tokenize
    tokens = word_tokenize(doc)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    return stemmed_tokens
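
# Illustrative sketch of the expected behavior (not part of the original pipeline):
# assuming NLTK's standard English stopword list and the Porter stemmer, a call like
#   preprocess_document("The cats are running quickly!")
# should yield roughly ['cat', 'run', 'quickli'].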

@st.cache_data
def find_dividing_words(documents):
    """
    Identifies candidate words that might split the set of documents into two groups.
    """
    all_words = []
    per_doc_word_counts = []
    for i, doc in enumerate(documents):
        print(f"Preprocessing document {i}")
        preprocessed_doc = preprocess_document(doc)
        all_words.extend(preprocessed_doc)
        per_doc_word_counts.append(Counter(preprocessed_doc))

    # Overall word frequency
    overall_word_counts = Counter(all_words)

    # Keep words that appear in roughly half of the documents
    num_docs = len(documents)
    candidate_words = []
    for word, count in overall_word_counts.items():
        doc_frequency = sum(1 for doc_count in per_doc_word_counts if doc_count[word] > 0)
        if 0.35 * num_docs <= doc_frequency <= 0.75 * num_docs:
            candidate_words.append(word)

    print("Done with dividing words")
    return candidate_words
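
# Worked example of the document-frequency filter above (illustrative numbers only):
# with num_docs = 100, a word is kept as a candidate if it occurs in between 35 and 75
# distinct documents (0.35 * 100 <= doc_frequency <= 0.75 * 100), i.e. in "roughly half".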

def make_contents(doc):
    """
    Returns the contents of a document as a single string, combining the available
    title/headline and body fields.
    """
    if "title" in doc and "contents" in doc:
        return doc["title"] + " " + doc["contents"]
    if "headline" in doc and "text" in doc:
        return doc["headline"] + " " + doc["text"]
    if "title" in doc and "text" in doc:
        return doc["title"] + " " + doc["text"]
    if "contents" in doc:
        return doc["contents"]
    if "text" in doc:
        return doc["text"]
    return ""  # fall back to an empty string if no known fields are present

def main(args):
    # Read the qrels and docs `.relevant_only` files from the `args.dataset` directory
    base_dir = os.path.join("data", args.dataset)
    qrels = pd.read_csv(os.path.join(base_dir, "qrels.relevant_only.trec"), sep="\t", header=None, names=["qid", "docid", "rel"])
    docs = pd.read_json(os.path.join(base_dir, "docs.relevant_only.jsonl"), lines=True)

    for qid in qrels.groupby("qid").groups.keys():
        # Get the relevant documents for the current query
        relevant_docids = qrels[qrels["qid"] == qid]["docid"].tolist()
        # Get the text for the relevant documents
        relevant_docs_text = docs[docs["doc_id"].isin(relevant_docids)].apply(lambda x: make_contents(x), axis=1).tolist()
        splitting_words = find_dividing_words(relevant_docs_text)
        breakpoint()  # drop into the debugger to inspect `splitting_words` for each query


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Find words that might split the set of documents into two groups.')
    parser.add_argument('dataset', type=str, help='The dataset to use (e.g. "robust04")')
    args = parser.parse_args()
    main(args)
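
# Example invocation (the script name is assumed here; it also presumes that
# data/robust04/qrels.relevant_only.trec and data/robust04/docs.relevant_only.jsonl exist):
#   python find_dividing_words.py robust04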