Commit d1cc79d
Parent: d0c9304

Deleting old files

utilities/pushshift_data.py  (+0 -175)
DELETED
@@ -1,175 +0,0 @@
-import time
-from datetime import datetime, timedelta, timezone
-from typing import Any, Dict, List, Optional
-
-import pandas as pd
-import requests
-
-from my_logger import setup_logger
-
-logger = setup_logger(__name__)
-
-
-def get_pushshift_data(subreddit: str, before: Optional[int] = None,
-                       after: Optional[int] = None, aggs: Optional[str] = None) -> Optional[Dict[str, Any]]:
-    """
-    Fetch data from the Pushshift API for the specified subreddit.
-
-    :param subreddit: The name of the subreddit to scrape.
-    :param before: The upper limit for the created_utc attribute of the submissions.
-    :param after: The lower limit for the created_utc attribute of the submissions.
-    :param aggs: The aggregation summary option to use.
-    :return: A dictionary containing the fetched data and aggregations if available.
-    """
-    url = "https://api.pushshift.io/reddit/search/submission/"
-    params = {
-        "subreddit": subreddit,
-        "size": 1000,
-        "sort": "created_utc",
-        "sort_type": "desc",
-    }
-    if before is not None:
-        params["before"] = before
-    if after is not None:
-        params["after"] = after
-    if aggs is not None:
-        params["aggs"] = aggs
-
-    response = requests.get(url, params=params)
-    if response.status_code == 200:
-        return response.json()
-    else:
-        logger.error(f"Error fetching data: {response.status_code}")
-        return None
-
-
-def get_post_count_for_day(subreddit: str, day_to_scrape: str) -> int:
-    """
-    Get the total number of posts for a specific day in the specified subreddit using the Pushshift API.
-
-    :param subreddit: The name of the subreddit to get the post count for.
-    :param day_to_scrape: The date for which to get the post count (format: "YYYY-MM-DD").
-    :return: The total number of posts for the specified day.
-    """
-    date_obj = datetime.strptime(day_to_scrape, "%Y-%m-%d")
-    after = int(date_obj.timestamp())
-    before = int((date_obj + timedelta(days=1)).timestamp())
-
-    response = get_pushshift_data(subreddit, before=before, after=after, aggs="created_utc")
-    if response is not None:
-        aggs = response.get("aggs", {}).get("created_utc", [])
-        if aggs:
-            return aggs[0]["doc_count"]
-    return 0
-
-
-def fetch_data(subreddit: str, before: int, after: int) -> Optional[Dict[str, Any]]:
-    url = "https://api.pushshift.io/reddit/search/submission/"
-    params = {
-        "subreddit": subreddit,
-        "size": 1000,
-        "sort": "created_utc",
-        "sort_type": "desc",
-        "before": before,
-        "after": after,
-    }
-
-    response = requests.get(url, params=params)
-    if response.status_code == 200:
-        return response.json()
-    else:
-        logger.error(f"Error fetching data: {response.status_code}")
-        return None
-
-
-def convert_timestamp_to_datetime(timestamp: int) -> str:
-    # Convert the timestamp to a datetime object
-    datetime_obj = datetime.utcfromtimestamp(timestamp)
-
-    # Add timezone information
-    datetime_obj_utc = datetime_obj.replace(tzinfo=timezone.utc)
-
-    # Convert the datetime object to a formatted string
-    datetime_str = datetime_obj_utc.strftime('%Y-%m-%d %H:%M:%S')
-
-    return datetime_str
-
-
-def scrape_submissions_by_day(subreddit_to_scrape: str, day_to_scrape: str) -> List[Dict[str, Any]]:
-    start_time = time.time()
-    scraped_submissions = []
-    date_obj = datetime.strptime(day_to_scrape, "%Y-%m-%d")
-
-    if date_obj > datetime.now() - timedelta(days=7):
-        logger.error("The specified date might not be available in the Pushshift API yet. "
-                     "Please try an earlier date or wait for the API to be updated.")
-        return scraped_submissions
-
-    after = int(date_obj.timestamp())
-    before = int((date_obj + timedelta(days=1)).timestamp())
-
-    # TODO: get_post_count_for_day didn't seem to work
-    # post_count = get_post_count_for_day(subreddit_to_scrape, day_to_scrape)
-    # total_requests = (post_count + 99) // 100  # Estimate the total number of requests
-
-    actual_requests = 0
-    while after < before:
-        after_str, before_str = convert_timestamp_to_datetime(after), convert_timestamp_to_datetime(before)
-        logger.debug(f"Fetching data between timestamps {after_str} and {before_str}")
-        data = get_pushshift_data(subreddit_to_scrape, before=before, after=after)
-        if data is None or len(data["data"]) == 0:
-            break
-
-        scraped_submissions.extend(data["data"])
-        before = data["data"][-1]["created_utc"]
-
-        actual_requests += 1
-        time.sleep(1)
-
-    elapsed_time = time.time() - start_time
-    if actual_requests:
-        logger.info(
-            f"{actual_requests}it [{elapsed_time // 60:02.0f}:{elapsed_time % 60:05.2f}, {elapsed_time / actual_requests:.2f}s/it]")
-    logger.info(
-        f"Finished scraping {len(scraped_submissions)} submissions in {elapsed_time:.2f} seconds across {actual_requests} requests")
-    return scraped_submissions
-
-
-def submissions_to_dataframe(submissions: List[Dict[str, Any]]) -> pd.DataFrame:
-    """
-    Parse a list of submissions into a pandas DataFrame.
-
-    :param submissions: A list of dictionaries containing the scraped submission data.
-    :return: A pandas DataFrame containing the submission data.
-    """
-    cols = ['score', 'num_comments', 'title', 'permalink', 'selftext', 'url', 'created_utc', 'author', 'id',
-            'downs', 'ups']
-    df = pd.DataFrame(submissions)
-    df = df.convert_dtypes()
-
-    # As of Jan 2017 I'm getting an error:
-    # KeyError: "['downs', 'ups'] not in index"
-    # To maintain backwards compatibility I will initialize these cols
-    for col in cols:
-        if col not in df.columns:
-            df[col] = None
-
-    # Take the subset of columns
-    df = df[cols]
-
-    # Convert the "created_utc" column to a datetime column with timezone information
-    df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s').dt.tz_localize('UTC')
-
-    # Using native date and time types had some incompatibility with the datasets visualization widget
-    df['date'] = df['created_utc'].dt.date.astype(str)
-    df['time'] = df['created_utc'].dt.time.astype(str)
-    return df
-
-
-if __name__ == '__main__':
-    subreddit_to_scrape = "askreddit"
-    day_to_scrape = "2013-03-01"
-    submissions = scrape_submissions_by_day(subreddit_to_scrape, day_to_scrape)
-    df = submissions_to_dataframe(submissions)
-    print(df.head().to_string())
-    logger.info(f"Scraped {len(submissions)} submissions from r/{subreddit_to_scrape} on {day_to_scrape}")
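
For reference, a minimal sketch of how the removed module was driven, mirroring its __main__ block above. The utilities.pushshift_data import path assumes the repo is laid out as an importable package, and the CSV filename is hypothetical:

    from utilities.pushshift_data import scrape_submissions_by_day, submissions_to_dataframe

    # Scrape one day of r/askreddit submissions and persist them
    submissions = scrape_submissions_by_day("askreddit", "2013-03-01")
    df = submissions_to_dataframe(submissions)
    df.to_csv("askreddit_2013-03-01.csv", index=False)  # hypothetical output path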
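
Should the module be needed again, it can be recovered from history rather than rewritten; a minimal sketch using only the parent hash and path shown in this diff:

    # Print the file as it existed in the parent commit
    git show d0c9304:utilities/pushshift_data.py

    # Or restore it into the working tree (git 2.23+)
    git restore --source=d0c9304 -- utilities/pushshift_data.py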