import json
import csv
import warnings

import numpy as np
import pandas as pd

# NOTE: this silences every warning process-wide, including pandas
# deprecation notices; kept because callers may rely on the quiet output.
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)


def convert_to_csv(filepath, output_path='docs/reddit_100000.csv'):
    """
    Convert a JSON-lines text file to CSV.

    Each non-blank input line is parsed as one JSON object and written as a
    row with the columns ``timestamp`` and ``user_id``.
    Run only once at the beginning.

    :param filepath: path of the input file, one JSON object per line
    :param output_path: path of the CSV file to write; defaults to the
        location that used to be hard-coded
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
    :raises ValueError: if a record has keys other than the two columns
        (``csv.DictWriter`` default behaviour)
    """
    fieldnames = ['timestamp', 'user_id']
    # newline='' lets the csv module control line endings; without it every
    # row is followed by an empty row on Windows.
    with open(filepath, 'r', encoding='utf-8') as f, \
            open(output_path, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            writer.writerow(json.loads(line))
    print('Successfully convert to CSV')


def read_data(filepath):
    """
    Read the CSV data and parse the ``timestamp`` column.

    The column is interpreted as seconds since the Unix epoch.

    :param filepath: path of the CSV file (must have a ``timestamp`` column)
    :return: dataframe named df, with ``timestamp`` as datetime
    """
    df = pd.read_csv(filepath)
    # Convert explicitly instead of using read_csv(date_parser=...), which
    # is deprecated in pandas 2.x.
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
    return df


def fill_ts(df):
    """
    Build the per-user hourly activity table.

    Timestamps are floored to the hour and the rows are counted per
    (``user_id``, hour) pair. The input dataframe is not modified.

    NOTE(review): the previous implementation merged the per-user totals
    with the raw rows by position, so the resulting values depended on row
    order; counting per user and hour is presumed to be the intent.

    :param df: dataframe with a datetime ``timestamp`` column and a
        ``user_id`` column
    :return: dataframe indexed by ``user_id`` with one column per hour;
        hours in which a user has no rows are NaN
    """
    hours = df['timestamp'].dt.floor('h')  # floor timestamp to hour
    per_user_hour = (
        df.assign(timestamp=hours)
        .groupby(['user_id', 'timestamp'])
        .size()
    )
    df_res = per_user_hour.unstack('timestamp')
    print(df_res.shape)
    return df_res