import json
import csv
import pandas as pd
import numpy as np
import warnings

# Silence ALL warnings process-wide (e.g. pandas chained-assignment noise).
# NOTE(review): this also hides deprecation warnings — consider narrowing
# to specific categories.
warnings.filterwarnings("ignore")
# Print full DataFrames: no column/row truncation, wide console output.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)
def convert_to_csv(filepath, out_path='docs/reddit_100000.csv'):
    """
    Convert a JSON-lines text file to a two-column CSV.

    Run only once at the beginning.

    :param filepath: path to the input file, one JSON object per line,
                     each containing at least 'timestamp' and 'user_id'
    :param out_path: destination CSV path (default kept for backward
                     compatibility with the original hardcoded path)
    :raises json.JSONDecodeError: if a non-empty line is not valid JSON
    """
    fieldnames = ['timestamp', 'user_id']
    # newline='' is required by the csv module to avoid doubled row
    # separators on Windows; extrasaction='ignore' keeps extra JSON keys
    # from raising ValueError.
    with open(filepath, 'r', encoding='utf-8') as f, \
            open(out_path, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
                                extrasaction='ignore')
        writer.writeheader()
        for line in f:
            line = line.strip()
            if not line:  # tolerate blank/trailing lines
                continue
            writer.writerow(json.loads(line))
    print('Successfully convert to CSV')
def read_data(filepath):
    """
    Read CSV data and parse the Unix-epoch 'timestamp' column.

    :param filepath: path to a CSV with a 'timestamp' column holding
                     seconds since the epoch
    :return: DataFrame named df with 'timestamp' as datetime64[ns]
    """
    # The `date_parser=` argument was deprecated in pandas 1.x and removed
    # in 2.0 — convert the column after reading instead.
    df = pd.read_csv(filepath)
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
    return df
def fill_ts(df):
    """
    Build a user-by-hour activity table from the raw event DataFrame.

    Side effect: mutates the caller's frame in place (floors 'timestamp'
    to the hour and resets its index).

    :param df: DataFrame with 'timestamp' (datetime64) and 'user_id' columns
    :return: pivoted DataFrame — rows are user_ids, columns are hourly
             timestamps, cells are summed 'count' values (NaN where a user
             has no paired event in that hour; nothing here actually fills
             the NaNs despite the docstring's original claim)
    """
    df['timestamp'] = df['timestamp'].dt.floor('H')  # floor timestamp to hour
    # Per-user total event frequencies (index: user_id, column: 'count').
    df_cnt = df.loc[:, "user_id"].value_counts().to_frame().rename_axis('user_id')
    df_cnt.columns = ['count']
    # NOTE(review): both frames are reset to a plain RangeIndex and then
    # merged POSITIONALLY — row i of the frequency table (the i-th most
    # frequent user) is paired with row i of the raw events. The pairing is
    # not keyed on user_id; this looks like a bug and a merge on 'user_id'
    # was probably intended. Confirm against the expected output before
    # changing.
    df.reset_index(drop=True, inplace=True)
    df_cnt.reset_index(drop=True, inplace=True)
    df = pd.merge(df_cnt, df, left_index=True, right_index=True, how='outer').set_index('user_id')
    # Wide table: index=user_id, columns=hour buckets, values=sum of 'count'.
    df_res = df.pivot_table(index='user_id', columns='timestamp', values='count', aggfunc=np.sum)
    print(df_res.shape)
    return df_res