Commit
•
1d46c26
1
Parent(s):
1b56724
Initializing cols
Browse files- utilities/pushshift_data.py +12 -0
utilities/pushshift_data.py
CHANGED
@@ -146,9 +146,21 @@ def submissions_to_dataframe(submissions: List[Dict[str, Any]]) -> pd.DataFrame:
|
|
146 |
'downs', 'ups']
|
147 |
df = pd.DataFrame(submissions)
|
148 |
df = df.convert_dtypes()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
df = df[cols]
|
|
|
150 |
# Convert the "created_utc" column to a datetime column with timezone information
|
151 |
df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s').dt.tz_localize('UTC')
|
|
|
|
|
152 |
df['date'] = df['created_utc'].dt.date.astype(str)
|
153 |
df['time'] = df['created_utc'].dt.time.astype(str)
|
154 |
return df
|
|
|
146 |
'downs', 'ups']
|
147 |
df = pd.DataFrame(submissions)
|
148 |
df = df.convert_dtypes()
|
149 |
+
|
150 |
+
# As of Jan 2017 I'm getting an error:
|
151 |
+
# KeyError: "['downs', 'ups'] not in index"
|
152 |
+
# To maintain backwards compatibility, initialize any missing columns to None
|
153 |
+
for col in cols:
|
154 |
+
if col not in df.columns:
|
155 |
+
df[col] = None
|
156 |
+
|
157 |
+
# Take the subset of columns
|
158 |
df = df[cols]
|
159 |
+
|
160 |
# Convert the "created_utc" column to a datetime column with timezone information
|
161 |
df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s').dt.tz_localize('UTC')
|
162 |
+
|
163 |
+
# Native date and time types were incompatible with the datasets visualization widget, so store them as strings
|
164 |
df['date'] = df['created_utc'].dt.date.astype(str)
|
165 |
df['time'] = df['created_utc'].dt.time.astype(str)
|
166 |
return df
|