In The News: Predicting Winning Political Candidates From News Tone

LT 13 | Basak, Chua, Danao, Roberto

Data Wrangling

In [4]:
import datetime
import pickle
import re

import dask as da
import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from dask.diagnostics import ProgressBar
# Create a dask ProgressBar and register it globally so it is active for the
# rest of the session rather than inside a single `with` block.
# NOTE(review): dask.diagnostics.ProgressBar is documented for the local
# (threaded / multiprocessing) schedulers; once the distributed Client created
# further down is the default scheduler it may print nothing -- confirm, or
# use dask.distributed.progress instead.
pbar = ProgressBar()
pbar.register()
In [2]:
from dask.distributed import Client
In [3]:
# Connect a dask.distributed client and keep a handle to it: the bare
# `Client()` call discarded the object, leaving no way to inspect it or call
# `client.close()` later in the notebook.
# NOTE(review): with no scheduler address this presumably starts a local
# cluster -- confirm that is the intended deployment.
client = Client()
# Last expression of the cell, so the cluster summary is still displayed.
client
Out[3]:

Client

Cluster

  • Workers: 8
  • Cores: 32
  • Memory: 270.13 GB
In [4]:
# Every column of the headerless, tab-separated exports is read as a plain
# object (string) column; numeric casts are applied later in the notebook.
raw_dtypes = dict.fromkeys(range(0, 59), 'object')

# Open all event exports in the public bucket whose file name starts with
# "201", using anonymous S3 access.
df = dd.read_csv(
    's3://gdelt-open-data/events/201*.export.csv',
    delimiter='\t',
    header=None,
    dtype=raw_dtypes,
    assume_missing=True,
    storage_options={'anon': True},
)
In [5]:
# The exports carry no header row, so the column names are taken from the
# 'Field Name' column of the first sheet of the local spreadsheet.
field_sheet = pd.read_excel('fieldids.xlsx', sheet_name='Sheet1')
ids = field_sheet['Field Name'].values

# Label the positional columns of the frame with those names.
df.columns = ids

# Display the resulting column count as a quick sanity check.
len(df.columns)
Out[7]:
58
In [8]:
# Row filter: ActionGeo_CountryCode equal to 'US' and SQLDATE starting
# with '2016'.
in_us = df['ActionGeo_CountryCode'] == 'US'
in_2016 = df['SQLDATE'].str.startswith('2016')

# Column filter: only the fields the rest of the notebook reads.
kept_columns = [
    'SQLDATE', 'MonthYear', 'Year',
    'Actor1Code', 'Actor1Name', 'Actor1CountryCode',
    'Actor2Code', 'Actor2Name', 'Actor2CountryCode',
    'NumMentions', 'NumSources', 'NumArticles',
    'AvgTone', 'GoldsteinScale',
    'SOURCEURL',
]

df = df[in_us & in_2016][kept_columns]
In [10]:
# The columns were loaded as objects; convert the count columns to int and
# the score columns to float (going through str first, as before).
numeric_targets = (
    ('NumMentions', int),
    ('NumSources', int),
    ('NumArticles', int),
    ('AvgTone', float),
    ('GoldsteinScale', float),
)
for column, numeric_type in numeric_targets:
    df[column] = df[column].astype(str).astype(numeric_type)
In [44]:
def time_serieser(df, name, cols=['NumMentions']):
    """Sum the requested metrics per ``SQLDATE`` for rows whose URL names a candidate.

    A row is kept when its ``SOURCEURL`` contains the first or the second
    hyphen-separated part of ``name``, or the whole ``name`` (lower-cased,
    matched literally). Because a single part is enough, a slug such as
    ``'gary-johnson'`` also matches any URL containing just ``johnson``.

    Changes from the previous version:

    * ``'ArtiTone'`` and ``'SourceTone'`` are now derived (they used to raise
      ``KeyError``), as ``NumArticles * AvgTone`` and ``NumSources * AvgTone``.
    * ``'MentTone'`` is ``NumMentions * AvgTone``; it used to be weighted by
      ``NumSources``, i.e. it was identical to ``'SourceTone'``.
      NOTE(review): confirm this is the intended definition.
    * Every requested column is returned; previously only the last one
      survived the loop.
    * ``df`` is no longer modified, names without a hyphen no longer raise
      ``IndexError``, missing URLs count as "no match", and only the requested
      columns are aggregated.

    Parameters
    ----------
    df : dask or pandas DataFrame
        Must provide ``SQLDATE``, ``SOURCEURL``, ``NumMentions``,
        ``NumSources``, ``NumArticles``, ``AvgTone`` and ``GoldsteinScale``.
    name : str
        Candidate slug, e.g. ``'hillary-clinton'``.
    cols : list of str, or str
        Columns to total. Besides the numeric columns of ``df`` the derived
        names ``'MentTone'``, ``'ArtiTone'`` and ``'SourceTone'`` are accepted.

    Returns
    -------
    pandas.Series
        When one column is requested: totals indexed by ``SQLDATE``.
    pandas.DataFrame
        When several columns are requested: one column per requested name.

    Raises
    ------
    ValueError
        If ``cols`` is empty or ``name`` has no non-empty search term.
    KeyError
        If a requested column is neither in ``df`` nor a derived name.
    """
    if isinstance(cols, str):
        cols = [cols]
    # Copy (never mutate the shared default list) and drop duplicates in order.
    cols = list(dict.fromkeys(cols))
    if not cols:
        raise ValueError('cols must name at least one column to aggregate')

    lowered = name.lower()
    terms = [t for t in dict.fromkeys(lowered.split('-')[:2] + [lowered]) if t]
    if not terms:
        raise ValueError('name must contain at least one non-empty search term')
    # One escaped alternation: a single pass over the URLs, and regex
    # metacharacters in a name are matched literally.
    pattern = '|'.join(re.escape(t) for t in terms)

    # Cast on a new frame so the caller's DataFrame is left untouched.
    typed = df.assign(
        NumMentions=df['NumMentions'].astype(str).astype(int),
        NumSources=df['NumSources'].astype(str).astype(int),
        NumArticles=df['NumArticles'].astype(str).astype(int),
        AvgTone=df['AvgTone'].astype(str).astype(float),
        GoldsteinScale=df['GoldsteinScale'].astype(str).astype(float),
    )

    # na=False: rows without a URL cannot mention the candidate.
    segment = typed[typed['SOURCEURL'].str.contains(pattern, na=False)]

    # Derived tone columns: AvgTone weighted by the matching count column.
    tone_weights = {
        'MentTone': 'NumMentions',
        'ArtiTone': 'NumArticles',
        'SourceTone': 'NumSources',
    }
    for col in cols:
        if col in tone_weights:
            weighted = segment[tone_weights[col]] * segment['AvgTone']
            segment = segment.assign(**{col: weighted})

    daily = segment.groupby('SQLDATE')[cols].sum()
    # dask results are lazy; plain pandas input is already concrete.
    if hasattr(daily, 'compute'):
        daily = daily.compute()

    return daily[cols[0]] if len(cols) == 1 else daily

Mentions

In [13]:
# Daily NumMentions totals for each candidate, pickled one file per candidate.
# `pickle` is provided by the notebook's import cell: previously the first
# three dumps ran before the `import pickle` that only appeared in the last
# cell of this section.
mention_jobs = (
    ('hillary-clinton', 'clinton_n_mentions.pkl'),
    ('donald-trump', 'trump_n_mentions.pkl'),
    ('jill-stein', 'stein_n_mentions.pkl'),
    ('gary-johnson', 'johnson_n_mentions.pkl'),
)

mention_series = []
for slug, pkl_path in mention_jobs:
    daily = time_serieser(df, name=slug, cols=['NumMentions'])
    # Save straight after computing so a later failure keeps earlier results.
    with open(pkl_path, 'wb') as f:
        pickle.dump(daily, f)
    mention_series.append(daily)

# Keep the short per-candidate names the original cells bound.
hc, dt, js, gj = mention_series

Sources

In [22]:
import pickle

# Daily NumSources totals for each candidate, pickled one file per candidate.
source_jobs = (
    ('hillary-clinton', 'clinton_n_sources.pkl'),
    ('donald-trump', 'trump_n_sources.pkl'),
    ('jill-stein', 'stein_n_sources.pkl'),
    ('gary-johnson', 'johnson_n_sources.pkl'),
)

source_series = []
for slug, pkl_path in source_jobs:
    daily = time_serieser(df, name=slug, cols=['NumSources'])
    with open(pkl_path, 'wb') as f:
        pickle.dump(daily, f)
    source_series.append(daily)

# Same short names as before: Clinton, Trump, Stein, Johnson.
hc, dt, js, gj = source_series

Articles

In [30]:
import pickle

# Daily NumArticles totals for each candidate, pickled one file per candidate.
article_jobs = (
    ('hillary-clinton', 'clinton_n_articles.pkl'),
    ('donald-trump', 'trump_n_articles.pkl'),
    ('jill-stein', 'stein_n_articles.pkl'),
    ('gary-johnson', 'johnson_n_articles.pkl'),
)

article_series = []
for slug, pkl_path in article_jobs:
    daily = time_serieser(df, name=slug, cols=['NumArticles'])
    with open(pkl_path, 'wb') as f:
        pickle.dump(daily, f)
    article_series.append(daily)

# Same short names as before: Clinton, Trump, Stein, Johnson.
hc, dt, js, gj = article_series

AvgTone

In [12]:
# Daily MentTone totals for each candidate, pickled one file per candidate.
# The cells of this section had been pasted twice (and the Johnson dump a
# third time), so every series was recomputed only to overwrite the same
# file. Each candidate is now computed and saved once; the files written and
# the names bound are unchanged.
# NOTE(review): the heading above this section reads "AvgTone" but every call
# requests 'MentTone' -- confirm which metric is meant.
menttone_jobs = (
    ('hillary-clinton', 'clinton_n_menttone.pkl'),
    ('donald-trump', 'trump_n_menttone.pkl'),
    ('jill-stein', 'stein_n_menttone.pkl'),
    ('gary-johnson', 'johnson_n_menttone.pkl'),
)

menttone_series = []
for slug, pkl_path in menttone_jobs:
    daily = time_serieser(df, name=slug, cols=['MentTone'])
    with open(pkl_path, 'wb') as f:
        pickle.dump(daily, f)
    menttone_series.append(daily)

# Keep the short per-candidate names the original cells bound.
hc, dt, js, gj = menttone_series

MentTone

In [12]:
# Daily MentTone totals for each candidate, pickled one file per candidate.
# The Johnson series used to be computed here but never saved, unlike the
# other three candidates and unlike every other section; it is now written
# to 'johnson_n_menttone.pkl' like its siblings.
menttone_jobs = (
    ('hillary-clinton', 'clinton_n_menttone.pkl'),
    ('donald-trump', 'trump_n_menttone.pkl'),
    ('jill-stein', 'stein_n_menttone.pkl'),
    ('gary-johnson', 'johnson_n_menttone.pkl'),
)

menttone_series = []
for slug, pkl_path in menttone_jobs:
    daily = time_serieser(df, name=slug, cols=['MentTone'])
    with open(pkl_path, 'wb') as f:
        pickle.dump(daily, f)
    menttone_series.append(daily)

# Keep the short per-candidate names the original cells bound.
hc, dt, js, gj = menttone_series

AvgTone

In [12]:
# Daily MentTone totals for each candidate, pickled one file per candidate.
# The Clinton call used to request 'AvgTone' while still writing to
# 'clinton_n_menttone.pkl', so that file held a different metric from the
# Trump / Stein / Johnson files. All four now use 'MentTone', matching the
# file names. The Johnson dump, previously pasted twice, runs once.
# NOTE(review): the heading above this section reads "AvgTone". If per-day
# AvgTone totals are what is wanted, request cols=['AvgTone'] for all four
# candidates and write them to their own files instead of the *_menttone ones.
menttone_jobs = (
    ('hillary-clinton', 'clinton_n_menttone.pkl'),
    ('donald-trump', 'trump_n_menttone.pkl'),
    ('jill-stein', 'stein_n_menttone.pkl'),
    ('gary-johnson', 'johnson_n_menttone.pkl'),
)

menttone_series = []
for slug, pkl_path in menttone_jobs:
    daily = time_serieser(df, name=slug, cols=['MentTone'])
    with open(pkl_path, 'wb') as f:
        pickle.dump(daily, f)
    menttone_series.append(daily)

# Keep the short per-candidate names the original cells bound.
hc, dt, js, gj = menttone_series

ArtiTone

In [25]:
import pickle

# Daily ArtiTone totals for each candidate, pickled one file per candidate.
# Relies on time_serieser being able to produce an 'ArtiTone' column.
artitone_jobs = (
    ('hillary-clinton', 'clinton_n_artitone.pkl'),
    ('donald-trump', 'trump_n_artitone.pkl'),
    ('jill-stein', 'stein_n_artitone.pkl'),
    ('gary-johnson', 'johnson_n_artitone.pkl'),
)

artitone_series = []
for slug, pkl_path in artitone_jobs:
    daily = time_serieser(df, name=slug, cols=['ArtiTone'])
    with open(pkl_path, 'wb') as f:
        pickle.dump(daily, f)
    artitone_series.append(daily)

# Same short names as before: Clinton, Trump, Stein, Johnson.
hc, dt, js, gj = artitone_series

SourceTone

In [36]:
import pickle

# Daily SourceTone totals for each candidate, pickled one file per candidate.
# Relies on time_serieser being able to produce a 'SourceTone' column.
sourcetone_jobs = (
    ('hillary-clinton', 'clinton_n_sourcetone.pkl'),
    ('donald-trump', 'trump_n_sourcetone.pkl'),
    ('jill-stein', 'stein_n_sourcetone.pkl'),
    ('gary-johnson', 'johnson_n_sourcetone.pkl'),
)

sourcetone_series = []
for slug, pkl_path in sourcetone_jobs:
    daily = time_serieser(df, name=slug, cols=['SourceTone'])
    with open(pkl_path, 'wb') as f:
        pickle.dump(daily, f)
    sourcetone_series.append(daily)

# Same short names as before: Clinton, Trump, Stein, Johnson.
hc, dt, js, gj = sourcetone_series

GoldsteinScale

In [46]:
import pickle

# Daily GoldsteinScale totals for each candidate, pickled one file per
# candidate.
goldstein_jobs = (
    ('hillary-clinton', 'clinton_n_goldstein.pkl'),
    ('donald-trump', 'trump_n_goldstein.pkl'),
    ('jill-stein', 'stein_n_goldstein.pkl'),
    ('gary-johnson', 'johnson_n_goldstein.pkl'),
)

goldstein_series = []
for slug, pkl_path in goldstein_jobs:
    daily = time_serieser(df, name=slug, cols=['GoldsteinScale'])
    with open(pkl_path, 'wb') as f:
        pickle.dump(daily, f)
    goldstein_series.append(daily)

# Same short names as before: Clinton, Trump, Stein, Johnson.
hc, dt, js, gj = goldstein_series
In [ ]: