import json
import csv
import pandas as pd
import numpy as np
import warnings

# Silence ALL warnings process-wide (e.g. pandas chained-assignment noise).
# NOTE(review): this also hides deprecation warnings — consider narrowing
# to specific categories.
warnings.filterwarnings("ignore")
# Print full DataFrames: no column/row truncation, wide console output.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)
def convert_to_csv(filepath, out_path='docs/reddit_100000.csv'):
    """
    Convert a JSON-lines text file to a two-column CSV.

    Run only once at the beginning.

    :param filepath: path to the input file, one JSON object per line,
                     each containing at least 'timestamp' and 'user_id'
    :param out_path: destination CSV path (default kept for backward
                     compatibility with the original hardcoded path)
    :raises json.JSONDecodeError: if a non-empty line is not valid JSON
    """
    fieldnames = ['timestamp', 'user_id']
    # newline='' is required by the csv module to avoid doubled row
    # separators on Windows; extrasaction='ignore' keeps extra JSON keys
    # from raising ValueError.
    with open(filepath, 'r', encoding='utf-8') as f, \
            open(out_path, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
                                extrasaction='ignore')
        writer.writeheader()
        for line in f:
            line = line.strip()
            if not line:  # tolerate blank/trailing lines
                continue
            writer.writerow(json.loads(line))
    print('Successfully convert to CSV')
def read_data(filepath):
    """
    Read CSV data and parse the Unix-epoch 'timestamp' column.

    :param filepath: path to a CSV with a 'timestamp' column holding
                     seconds since the epoch
    :return: DataFrame named df with 'timestamp' as datetime64[ns]
    """
    # The `date_parser=` argument was deprecated in pandas 1.x and removed
    # in 2.0 — convert the column after reading instead.
    df = pd.read_csv(filepath)
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
    return df
def fill_ts(df):
    """
    Build a user-by-hour activity table from the raw event DataFrame.

    Side effect: mutates the caller's frame in place (floors 'timestamp'
    to the hour and resets its index).

    :param df: DataFrame with 'timestamp' (datetime64) and 'user_id' columns
    :return: pivoted DataFrame — rows are user_ids, columns are hourly
             timestamps, cells are summed 'count' values (NaN where a user
             has no paired event in that hour; nothing here actually fills
             the NaNs despite the docstring's original claim)
    """
    df['timestamp'] = df['timestamp'].dt.floor('H')  # floor timestamp to hour
    # Per-user total event frequencies (index: user_id, column: 'count').
    df_cnt = df.loc[:, "user_id"].value_counts().to_frame().rename_axis('user_id')
    df_cnt.columns = ['count']
    # NOTE(review): both frames are reset to a plain RangeIndex and then
    # merged POSITIONALLY — row i of the frequency table (the i-th most
    # frequent user) is paired with row i of the raw events. The pairing is
    # not keyed on user_id; this looks like a bug and a merge on 'user_id'
    # was probably intended. Confirm against the expected output before
    # changing.
    df.reset_index(drop=True, inplace=True)
    df_cnt.reset_index(drop=True, inplace=True)
    df = pd.merge(df_cnt, df, left_index=True, right_index=True, how='outer').set_index('user_id')
    # Wide table: index=user_id, columns=hour buckets, values=sum of 'count'.
    df_res = df.pivot_table(index='user_id', columns='timestamp', values='count', aggfunc=np.sum)
    print(df_res.shape)
    return df_res