update_ts.py 1.6 KB

import json
import csv
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)
def convert_to_csv(filepath):
    """
    Convert the raw txt file (one JSON object per line) to CSV.
    Run only once at the beginning.
    """
    with open(filepath, 'r') as f, open('docs/reddit_100000.csv', 'w', encoding='utf-8') as csvfile:
        fieldnames = ['timestamp', 'user_id']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for line in f:
            # each input line is a JSON object with 'timestamp' and 'user_id' keys
            writer.writerow(json.loads(line))
    print('Successfully converted to CSV')
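
# Hypothetical input line, shown only as an illustration; the exact values are
# not in the original code, only the two field names are:
#   {"timestamp": 1419397375, "user_id": "exampleUser"}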
def read_data(filepath):
    """
    Read the CSV data and parse the timestamp column.
    :return: dataframe named df
    """
    df = pd.read_csv(filepath)
    # timestamps are stored as Unix epoch seconds; convert after reading
    # (avoids the date_parser argument, which newer pandas no longer supports)
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
    return df
def fill_ts(df):
    """
    Set values for the time series and fill N/A values.
    :param df: time series dataframe
    :return: filled dataframe
    """
    df['timestamp'] = df['timestamp'].dt.floor('H')  # floor timestamps to the hour
    # total post count per user
    df_cnt = df.loc[:, "user_id"].value_counts().to_frame().rename_axis('user_id')
    df_cnt.columns = ['count']
    df.reset_index(drop=True, inplace=True)
    df_cnt.reset_index(drop=True, inplace=True)
    # align the counts with the original rows by position, then index by user
    df = pd.merge(df_cnt, df, left_index=True, right_index=True, how='outer').set_index('user_id')
    # rows: users, columns: hourly buckets, values: summed counts
    df_res = df.pivot_table(index='user_id', columns='timestamp', values='count', aggfunc=np.sum)
    print(df_res.shape)
    return df_res
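
# A minimal usage sketch, not part of the original script: it chains the three
# steps above. The raw-export path 'docs/reddit_100000.txt' is an assumption;
# only the CSV path written by convert_to_csv is fixed in the code.
if __name__ == '__main__':
    convert_to_csv('docs/reddit_100000.txt')   # run once to produce the CSV
    df = read_data('docs/reddit_100000.csv')   # load CSV and parse timestamps
    ts_table = fill_ts(df)                     # users x hourly buckets pivot
    print(ts_table.head())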