import datetime as dt
import time

import pandas as pd
from psaw import PushshiftAPI
# Export window: comments created after start_epoch and before end_epoch.
# The datetimes are naive, so .timestamp() interprets them in the local
# timezone of the machine running the script.
start_epoch = int(dt.datetime(2019, 1, 1).timestamp())
end_epoch = int(dt.datetime(2019, 6, 1).timestamp())

# Destination CSV; every chunk is appended to this single file.
filepath = r'D:\Reddit\funny_2019_3.csv'

# Subreddits to export, processed in order.
subreddit_name = ['funny']

# Number of comments buffered in memory before a chunk is written to disk.
max_response_cache = 1000000

# Comment fields requested from the API.
comment_fields = ['author', 'created_utc', 'subreddit', 'body', 'id',
                  'parent_id', 'score', 'author_flair_css_class',
                  'author_flair_text', 'metadata']


def write_chunk(rows, path, write_header):
    """Append *rows* to the CSV file at *path*.

    Args:
        rows: Sequence of records accepted by ``pd.DataFrame`` (the script
            passes the comment objects yielded by ``api.search_comments``).
        path: Destination CSV path; the file is opened in append mode.
        write_header: When true, the column header row is written before
            the data rows.

    Returns:
        ``True`` if a chunk was written, ``False`` if *rows* was empty and
        the file was left untouched.
    """
    if not rows:
        # An empty frame has no columns to drop and nothing worth writing.
        return False
    frame = pd.DataFrame(rows)
    # "d_" and "created" are helper columns that are not wanted in the
    # export (presumably added by psaw itself -- TODO confirm); ignore them
    # if they are absent rather than failing the whole write.
    frame = frame.drop(columns=["d_", "created"], errors="ignore")
    frame.to_csv(path, mode='a', encoding='utf-8-sig', index=False,
                 header=write_header)
    return True


def main():
    """Download the configured comments and append them to ``filepath``.

    Comments are buffered and written in chunks of ``max_response_cache``
    rows; whatever remains once a subreddit is exhausted is written as a
    final, smaller chunk. The header row is emitted only with the first
    chunk that is actually written.

    Any exception is reported on stdout, followed by a pause; the script
    then ends (there is no retry, and rows still in the buffer at that
    point are not written).
    """
    api = PushshiftAPI()
    cache = []
    header_pending = True
    try:
        for name in subreddit_name:
            gen = api.search_comments(subreddit=name,
                                      filter=comment_fields,
                                      after=start_epoch,
                                      before=end_epoch)
            for c in gen:
                cache.append(c)
                if len(cache) >= max_response_cache:
                    print("writing...")
                    # Write each chunk exactly once; the header goes out
                    # only with the first successful write.
                    if write_chunk(cache, filepath, header_pending):
                        header_pending = False
                    cache = []
            # Flush the tail that never reached the chunk size.
            if write_chunk(cache, filepath, header_pending):
                header_pending = False
            cache = []
    except ConnectionAbortedError:
        print("ConnectionAbortedError occurred")
        time.sleep(20)
    except ConnectionResetError:
        print("ConnectionResetError occurred")
        time.sleep(60)
    except Exception as exc:
        # Exception (not a bare except) so Ctrl-C / SystemExit still work;
        # also show what actually failed instead of hiding it.
        print("other exception occurred")
        print(f"{type(exc).__name__}: {exc}")
        time.sleep(60)


if __name__ == "__main__":
    main()