# LT 13 | Basak, Chua, Danao, Roberto
import numpy as np
import dask as da
import dask.dataframe as dd
import matplotlib.pyplot as plt
import datetime
import pandas as pd
# Register a progress bar so dask computations report their progress.
# NOTE(review): dask.diagnostics.ProgressBar is documented for the local
# schedulers; confirm it still reports once the distributed Client below
# becomes the active scheduler.
from dask.diagnostics import ProgressBar
pbar = ProgressBar()
pbar.register()
# Start a dask.distributed client with default settings. The instance is
# not kept in a variable here.
# NOTE(review): with no arguments this presumably spins up a local
# cluster and makes it the default scheduler -- confirm against the
# dask.distributed version in use.
from dask.distributed import Client
Client()
# Lazily read every matching tab-separated export from the public
# gdelt-open-data bucket. The files have no header row, and all 59 column
# positions are read as raw objects so nothing is coerced during parsing.
raw_dtypes = dict.fromkeys(range(59), 'object')
df = dd.read_csv(
    's3://gdelt-open-data/events/201*.export.csv',
    header=None,
    delimiter='\t',
    assume_missing=True,
    dtype=raw_dtypes,
    storage_options={'anon': True},
)

# Column names come from the 'Field Name' column of the local spreadsheet.
field_sheet = pd.read_excel('fieldids.xlsx', sheet_name='Sheet1')
ids = field_sheet['Field Name'].values
df.columns = ids
# Bare expression kept as-is: it only shows the column count when run
# interactively and has no effect when the file runs as a script.
len(df.columns)

# Keep rows whose action country code is 'US' and whose SQLDATE string
# starts with '2016', then narrow the frame to the columns used later.
in_us = df['ActionGeo_CountryCode'] == 'US'
in_2016 = df['SQLDATE'].str.startswith('2016')
kept_columns = [
    'SQLDATE', 'MonthYear', 'Year',
    'Actor1Code', 'Actor1Name', 'Actor1CountryCode',
    'Actor2Code', 'Actor2Name', 'Actor2CountryCode',
    'NumMentions', 'NumSources', 'NumArticles',
    'AvgTone', 'GoldsteinScale',
    'SOURCEURL',
]
df = df[in_us & in_2016][kept_columns]

# Everything was read as object, so go through str before casting the
# numeric columns to their working types.
for count_column in ('NumMentions', 'NumSources', 'NumArticles'):
    df[count_column] = df[count_column].astype(str).astype(int)
for score_column in ('AvgTone', 'GoldsteinScale'):
    df[score_column] = df[score_column].astype(str).astype(float)
# Derived "tone weighted by volume" columns: requested column name -> the
# count column that is multiplied by AvgTone to produce it.
_WEIGHTED_TONE_COLUMNS = {
    'MentTone': 'NumMentions',
    'ArtiTone': 'NumArticles',
    'SourceTone': 'NumSources',
}

# Numeric columns of the event table and the type each one is cast to.
_NUMERIC_COLUMN_TYPES = {
    'NumMentions': int,
    'NumSources': int,
    'NumArticles': int,
    'AvgTone': float,
    'GoldsteinScale': float,
}


def time_serieser(df, name, cols=('NumMentions',)):
    """Sum the requested columns per SQLDATE for events whose URL mentions *name*.

    An event is selected when its ``SOURCEURL`` contains any hyphen-separated
    part of ``name`` (lower-cased, matched literally and case-sensitively).
    A URL containing the full hyphenated name necessarily contains each of
    its parts, so the parts alone define the match.

    Parameters
    ----------
    df : dask or pandas DataFrame
        Must provide ``SQLDATE``, ``SOURCEURL``, ``NumMentions``,
        ``NumSources``, ``NumArticles``, ``AvgTone`` and ``GoldsteinScale``.
        The frame passed in is not modified.
    name : str
        Search key such as ``'hillary-clinton'``. A key without a hyphen is
        searched as a single term.
    cols : str or sequence of str
        Columns to total. Besides the numeric columns above, the derived
        columns ``MentTone``, ``ArtiTone`` and ``SourceTone`` are available;
        each is the matching count column multiplied by ``AvgTone``.

    Returns
    -------
    pandas.Series
        Indexed by ``SQLDATE``, when exactly one column is requested.
    pandas.DataFrame
        One column per requested name, when several are requested.

    Raises
    ------
    ValueError
        If ``cols`` is empty or ``name`` contains no search term.
    KeyError
        If a requested column is neither numeric nor a known derived column.
    """
    if isinstance(cols, str):
        cols = [cols]
    wanted = []
    for col in cols:
        if col not in wanted:
            wanted.append(col)
    if not wanted:
        raise ValueError('cols must name at least one column')

    terms = [part for part in name.lower().split('-') if part]
    if not terms:
        raise ValueError('name must contain at least one search term')

    # na=False keeps the mask strictly boolean when a URL is missing;
    # regex=False makes the search terms literal text.
    urls = df['SOURCEURL']
    matches = None
    for term in terms:
        hit = urls.str.contains(term, regex=False, na=False)
        matches = hit if matches is None else (matches | hit)

    segment = df[matches][['SQLDATE'] + list(_NUMERIC_COLUMN_TYPES)]

    # Cast on the derived frame (via str, since the raw columns may be
    # objects) so the caller's frame is left untouched.
    segment = segment.assign(**{
        column: segment[column].astype(str).astype(kind)
        for column, kind in _NUMERIC_COLUMN_TYPES.items()
    })

    weighted = {
        col: segment[_WEIGHTED_TONE_COLUMNS[col]] * segment['AvgTone']
        for col in wanted
        if col in _WEIGHTED_TONE_COLUMNS
    }
    if weighted:
        segment = segment.assign(**weighted)

    # Select the wanted columns before aggregating so only they are summed.
    totals = segment.groupby('SQLDATE')[wanted].sum()
    if hasattr(totals, 'compute'):
        # dask result: materialise it; a pandas result is already concrete.
        totals = totals.compute()

    if len(wanted) == 1:
        return totals[wanted[0]]
    return totals
# pickle must be in scope before the first dump below.
import pickle

# (URL search key, output-file prefix) for each tracked person.
CANDIDATES = [
    ('hillary-clinton', 'clinton'),
    ('donald-trump', 'trump'),
    ('jill-stein', 'stein'),
    ('gary-johnson', 'johnson'),
]

# (column passed to time_serieser, output-file suffix), in export order.
METRICS = [
    ('NumMentions', 'mentions'),
    ('NumSources', 'sources'),
    ('NumArticles', 'articles'),
    ('MentTone', 'menttone'),
    ('ArtiTone', 'artitone'),
    ('SourceTone', 'sourcetone'),
    ('GoldsteinScale', 'goldstein'),
]

# Compute each (metric, person) daily series exactly once and pickle it to
# '<prefix>_n_<suffix>.pkl'. Every *_n_menttone.pkl file, including
# clinton's, is written from the MentTone series.
daily_totals = {}
for metric, suffix in METRICS:
    for search_key, prefix in CANDIDATES:
        daily_totals[search_key] = time_serieser(
            df, name=search_key, cols=[metric])
        with open('{}_n_{}.pkl'.format(prefix, suffix), 'wb') as f:
            pickle.dump(daily_totals[search_key], f)

# Leave the short names bound to the last exported metric (GoldsteinScale),
# for any later code that reads them.
hc = daily_totals['hillary-clinton']
dt = daily_totals['donald-trump']
js = daily_totals['jill-stein']
gj = daily_totals['gary-johnson']