derek-thomas HF staff committed on
Commit
d1cc79d
1 Parent(s): d0c9304

Deleting old files

Browse files
Files changed (1) hide show
  1. utilities/pushshift_data.py +0 -175
utilities/pushshift_data.py DELETED
@@ -1,175 +0,0 @@
1
- import time
2
- from datetime import datetime, timedelta, timezone
3
- from typing import Any, Dict, List, Optional
4
-
5
- import pandas as pd
6
- import requests
7
-
8
- from my_logger import setup_logger
9
-
10
- logger = setup_logger(__name__)
11
-
12
-
13
def get_pushshift_data(subreddit: str, before: Optional[int] = None,
                       after: Optional[int] = None, aggs: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """
    Fetch data from the Pushshift API for the specified subreddit.

    :param subreddit: The name of the subreddit to scrape.
    :param before: The upper limit for the created_utc attribute of the submissions.
    :param after: The lower limit for the created_utc attribute of the submissions.
    :param aggs: The aggregation summary option to use.
    :return: A dictionary containing the fetched data and aggregations if available,
        or None when the request fails for any reason.
    """
    url = "https://api.pushshift.io/reddit/search/submission/"
    params: Dict[str, Any] = {
        "subreddit": subreddit,
        "size": 1000,
        "sort": "created_utc",
        "sort_type": "desc",
    }
    # Only send the optional filters the caller actually provided.
    if before is not None:
        params["before"] = before
    if after is not None:
        params["after"] = after
    if aggs is not None:
        params["aggs"] = aggs

    try:
        # A timeout keeps the scraper from hanging forever on a stalled connection;
        # without it requests.get() can block indefinitely.
        response = requests.get(url, params=params, timeout=30)
    except requests.RequestException as e:
        # Network-level failures (DNS, timeout, connection reset) previously
        # propagated and crashed the scraping loop; treat them like HTTP errors.
        logger.error(f"Error fetching data: {e}")
        return None
    if response.status_code == 200:
        return response.json()
    logger.error(f"Error fetching data: {response.status_code}")
    return None
44
-
45
-
46
def get_post_count_for_day(subreddit: str, day_to_scrape: str) -> int:
    """
    Get the total number of posts for a specific day in the specified subreddit using the Pushshift API.

    :param subreddit: The name of the subreddit to get the post count for.
    :param day_to_scrape: The date for which to get the post count (format: "YYYY-MM-DD").
    :return: The total number of posts for the specified day, or 0 when unavailable.
    """
    day_start = datetime.strptime(day_to_scrape, "%Y-%m-%d")
    window_start = int(day_start.timestamp())
    window_end = int((day_start + timedelta(days=1)).timestamp())

    result = get_pushshift_data(subreddit, before=window_end, after=window_start, aggs="created_utc")
    if result is None:
        return 0
    # The aggregation response nests the daily bucket under aggs -> created_utc.
    buckets = result.get("aggs", {}).get("created_utc", [])
    return buckets[0]["doc_count"] if buckets else 0
64
-
65
-
66
def fetch_data(subreddit: str, before: int, after: int) -> Optional[Dict[str, Any]]:
    """
    Fetch a page of submissions created between two timestamps.

    Thin wrapper around get_pushshift_data with both time bounds required;
    it previously duplicated the whole HTTP request logic verbatim.

    :param subreddit: The name of the subreddit to scrape.
    :param before: The upper limit for the created_utc attribute of the submissions.
    :param after: The lower limit for the created_utc attribute of the submissions.
    :return: A dictionary containing the fetched data, or None on failure.
    """
    # Delegate to the shared helper instead of duplicating the request code;
    # it builds the exact same URL and parameters.
    return get_pushshift_data(subreddit, before=before, after=after)
83
-
84
-
85
def convert_timestamp_to_datetime(timestamp: int) -> str:
    """
    Convert a Unix timestamp to a 'YYYY-MM-DD HH:MM:SS' string in UTC.

    :param timestamp: Seconds since the Unix epoch.
    :return: The formatted UTC date-time string.
    """
    # fromtimestamp(..., tz=timezone.utc) produces an aware UTC datetime directly;
    # the original utcfromtimestamp() + replace(tzinfo=...) dance is deprecated
    # since Python 3.12 and yields the same result.
    datetime_obj_utc = datetime.fromtimestamp(timestamp, tz=timezone.utc)
    return datetime_obj_utc.strftime('%Y-%m-%d %H:%M:%S')
96
-
97
-
98
def scrape_submissions_by_day(subreddit_to_scrape: str, day_to_scrape: str) -> List[Dict[str, Any]]:
    """
    Scrape all submissions posted to a subreddit on a given day.

    Pages backwards through the Pushshift API (results sorted newest first),
    moving the ``before`` cursor to the oldest submission of each page until
    the day's window is exhausted.

    :param subreddit_to_scrape: The name of the subreddit to scrape.
    :param day_to_scrape: The date to scrape (format: "YYYY-MM-DD").
    :return: A list of raw submission dictionaries (possibly empty).
    """
    start_time = time.time()
    scraped_submissions: List[Dict[str, Any]] = []
    date_obj = datetime.strptime(day_to_scrape, "%Y-%m-%d")

    # Pushshift ingestion lags behind Reddit, so very recent days may be incomplete.
    if date_obj > datetime.now() - timedelta(days=7):
        logger.error("The specified date might not be available in the Pushshift API yet. "
                     "Please try an earlier date or wait for the API to be updated.")
        return scraped_submissions

    after = int(date_obj.timestamp())
    before = int((date_obj + timedelta(days=1)).timestamp())

    # todo get_post_count_for_day didnt seem to work
    # post_count = get_post_count_for_day(subreddit_to_scrape, day_to_scrape)
    # total_requests = (post_count + 99) // 100  # Estimate the total number of requests

    actual_requests = 0
    while after < before:
        after_str, before_str = convert_timestamp_to_datetime(after), convert_timestamp_to_datetime(before)
        logger.debug(f"Fetching data between timestamps {after_str} and {before_str}")
        data = get_pushshift_data(subreddit_to_scrape, before=before, after=after)
        if data is None or len(data["data"]) == 0:
            break

        scraped_submissions.extend(data["data"])
        next_before = data["data"][-1]["created_utc"]
        # Guard against an infinite loop: if the cursor does not move backwards
        # (e.g. a full page of submissions sharing one timestamp), the original
        # code would repeat the identical request forever. Stop instead.
        if next_before >= before:
            break
        before = next_before

        actual_requests += 1
        time.sleep(1)  # Be polite to the API: at most ~1 request per second.

    elapsed_time = time.time() - start_time
    if actual_requests:
        logger.info(
            f"{actual_requests}it [{elapsed_time // 60:02}:{elapsed_time % 60:.2f} {elapsed_time / actual_requests:.2f}s/it]")
    logger.info(
        f"Finished scraping {len(scraped_submissions)} submissions in {elapsed_time:.2f} seconds in {actual_requests} requests")
    return scraped_submissions
136
-
137
-
138
def submissions_to_dataframe(submissions: List[Dict[str, Any]]) -> pd.DataFrame:
    """
    Parse a list of submissions into a pandas DataFrame.

    :param submissions: A list of dictionaries containing the scraped submission data.
    :return: A pandas DataFrame containing the submission data.
    """
    cols = ['score', 'num_comments', 'title', 'permalink', 'selftext', 'url', 'created_utc', 'author', 'id',
            'downs', 'ups']
    df = pd.DataFrame(submissions).convert_dtypes()

    # Some columns (e.g. 'downs'/'ups') stopped appearing in the API payload at
    # some point; backfill any missing ones with None so the column subset
    # below never raises a KeyError.
    missing = [col for col in cols if col not in df.columns]
    for col in missing:
        df[col] = None

    # Take the subset of columns
    df = df[cols]

    # Convert the "created_utc" column to a timezone-aware datetime column.
    df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s').dt.tz_localize('UTC')

    # Native date/time types had some incompatibility with the datasets
    # visualization widget, so store them as plain strings instead.
    df['date'] = df['created_utc'].dt.date.astype(str)
    df['time'] = df['created_utc'].dt.time.astype(str)
    return df
167
-
168
-
169
if __name__ == '__main__':
    # Manual smoke test: scrape a single historical day and preview the result.
    target_subreddit = "askreddit"
    target_day = "2013-03-01"
    scraped = scrape_submissions_by_day(target_subreddit, target_day)
    frame = submissions_to_dataframe(scraped)
    print(frame.head().to_string())
    logger.info(f"Scraped {len(scraped)} submissions from r/{target_subreddit} on {target_day}")