# In The News: Predicting Winning Political Candidates From News Tone

LT 13 | Basak, Chua, Danao, Roberto

## Data Wrangling

import numpy as np
import dask as da
import dask.dataframe as dd
import matplotlib.pyplot as plt
import datetime
import pandas as pd

from dask.diagnostics import ProgressBar
# Create a dask.diagnostics progress bar and register it.
pbar = ProgressBar()
pbar.register()

from dask.distributed import Client

# Instantiate a distributed Client with default arguments. The instance is
# not stored in a variable.
Client()

# Lazily read every matching event export from the public GDELT bucket.
# The files are tab-delimited, have no header row, are fetched with
# anonymous S3 access, and every column is read as 'object'.
df = dd.read_csv(
    's3://gdelt-open-data/events/201*.export.csv',
    header=None,
    delimiter='\t',
    dtype=dict.fromkeys(range(0, 59), 'object'),
    assume_missing=True,
    storage_options={'anon': True},
)

# Column names come from the 'Field Name' column of Sheet1 in fieldids.xlsx.
ids = pd.read_excel('fieldids.xlsx', sheet_name='Sheet1')['Field Name'].values
df.columns = ids

len(df.columns)

58

# Keep only rows whose action country code is 'US' and whose SQLDATE string
# starts with '2016', then narrow the frame to the columns listed below.
_kept_columns = [
    'SQLDATE', 'MonthYear', 'Year',
    'Actor1Code', 'Actor1Name', 'Actor1CountryCode',
    'Actor2Code', 'Actor2Name', 'Actor2CountryCode',
    'NumMentions', 'NumSources', 'NumArticles',
    'AvgTone', 'GoldsteinScale', 'SOURCEURL',
]
_is_us = df['ActionGeo_CountryCode'] == 'US'
_is_2016 = df['SQLDATE'].str.startswith('2016')
df = df[_is_us & _is_2016][_kept_columns]

# Cast the count columns to int and the score columns to float, going
# through str first.
for _column, _kind in (
    ('NumMentions', int),
    ('NumSources', int),
    ('NumArticles', int),
    ('AvgTone', float),
    ('GoldsteinScale', float),
):
    df[_column] = df[_column].astype(str).astype(_kind)

def time_serieser(df, name, cols=('NumMentions',)):
    """Return per-SQLDATE sums of one column for events that match ``name``.

    An event matches when its ``SOURCEURL`` contains, as a literal
    substring, the lower-cased ``name`` or either of the first two
    hyphen-separated parts of it (``'hillary-clinton'`` matches URLs
    containing ``hillary`` or ``clinton``). Rows with a missing URL never
    match.

    Parameters
    ----------
    df : dask or pandas DataFrame
        Must provide ``SQLDATE``, ``Actor1Name``, ``Actor2Name``,
        ``NumMentions``, ``NumSources``, ``NumArticles``, ``AvgTone``,
        ``GoldsteinScale`` and ``SOURCEURL``. The frame is not modified.
    name : str
        Candidate slug such as ``'donald-trump'``. A slug without a hyphen
        is accepted.
    cols : str or sequence of str
        Column(s) to aggregate. As before, only the LAST entry is
        aggregated and returned; earlier entries are no longer computed
        and thrown away. Besides the numeric input columns, three derived
        tone columns are available, each ``AvgTone`` multiplied by a count:
        ``MentTone`` (NumMentions), ``ArtiTone`` (NumArticles) and
        ``SourceTone`` (NumSources).

    Returns
    -------
    pandas.Series
        Sum of the requested column, indexed by ``SQLDATE``.

    Raises
    ------
    ValueError
        If ``cols`` is empty or ``name`` has no non-empty part.
    KeyError
        If the requested column is neither present nor a derived tone.
    """
    # Derived column -> count column that multiplies AvgTone.
    tone_weights = {
        'MentTone': 'NumMentions',
        'ArtiTone': 'NumArticles',
        'SourceTone': 'NumSources',
    }

    if isinstance(cols, str):
        cols = (cols,)
    cols = tuple(cols)
    if not cols:
        raise ValueError('cols must name at least one column')
    target = cols[-1]

    full_name = name.lower()
    # First two hyphen-separated parts plus the full slug, de-duplicated.
    tokens = [tok for tok in
              dict.fromkeys(full_name.split('-')[:2] + [full_name]) if tok]
    if not tokens:
        raise ValueError('name must not be empty')

    urls = df['SOURCEURL']
    mask = None
    for tok in tokens:
        # Literal match; na=False keeps the mask strictly boolean.
        hit = urls.str.contains(tok, regex=False, na=False)
        mask = hit if mask is None else (mask | hit)

    segment = df[mask][['SQLDATE', 'Actor1Name', 'Actor2Name',
                        'NumMentions', 'NumSources', 'NumArticles',
                        'AvgTone', 'GoldsteinScale']]
    # Cast on the local selection so the caller's frame is left untouched.
    segment = segment.assign(
        NumMentions=segment['NumMentions'].astype(str).astype(int),
        NumSources=segment['NumSources'].astype(str).astype(int),
        NumArticles=segment['NumArticles'].astype(str).astype(int),
        AvgTone=segment['AvgTone'].astype(str).astype(float),
        GoldsteinScale=segment['GoldsteinScale'].astype(str).astype(float),
    )

    if target in tone_weights:
        weighted = segment[tone_weights[target]] * segment['AvgTone']
        segment = segment.assign(**{target: weighted})

    # Aggregate only the requested column rather than summing every column.
    summed = segment.groupby('SQLDATE')[target].sum()
    if hasattr(summed, 'compute'):  # dask result: materialise it
        summed = summed.compute()
    return pd.Series(summed)

### Mentions

# pickle must be imported before the first dump below.
import pickle

# Compute the per-date NumMentions series for each candidate and pickle it
# as soon as it is available.
hc = time_serieser(df, name='hillary-clinton', cols=['NumMentions'])
with open('clinton_n_mentions.pkl', 'wb') as f:
    pickle.dump(hc, f)

dt = time_serieser(df, name='donald-trump', cols=['NumMentions'])
with open('trump_n_mentions.pkl', 'wb') as f:
    pickle.dump(dt, f)

js = time_serieser(df, name='jill-stein', cols=['NumMentions'])
with open('stein_n_mentions.pkl', 'wb') as f:
    pickle.dump(js, f)

gj = time_serieser(df, name='gary-johnson', cols=['NumMentions'])
with open('johnson_n_mentions.pkl', 'wb') as f:
    pickle.dump(gj, f)

### Sources

import pickle

# Per-date NumSources series for each candidate; every series is pickled
# right after it is computed.
_source_series = []
for _slug, _target in (
    ('hillary-clinton', 'clinton_n_sources.pkl'),
    ('donald-trump', 'trump_n_sources.pkl'),
    ('jill-stein', 'stein_n_sources.pkl'),
    ('gary-johnson', 'johnson_n_sources.pkl'),
):
    _source_series.append(time_serieser(df, name=_slug, cols=['NumSources']))
    with open(_target, 'wb') as f:
        pickle.dump(_source_series[-1], f)

# Expose each series under its per-candidate name.
hc, dt, js, gj = _source_series

### Articles

import pickle

# Per-date NumArticles series for each candidate; every series is pickled
# right after it is computed.
_article_series = []
for _slug, _target in (
    ('hillary-clinton', 'clinton_n_articles.pkl'),
    ('donald-trump', 'trump_n_articles.pkl'),
    ('jill-stein', 'stein_n_articles.pkl'),
    ('gary-johnson', 'johnson_n_articles.pkl'),
):
    _article_series.append(time_serieser(df, name=_slug, cols=['NumArticles']))
    with open(_target, 'wb') as f:
        pickle.dump(_article_series[-1], f)

# Expose each series under its per-candidate name.
hc, dt, js, gj = _article_series

### AvgTone

import pickle

# Compute each candidate's per-date MentTone series once and pickle it.
# Each time_serieser call triggers a compute over the filtered event data,
# so every series is built a single time and every file is written once.
hc = time_serieser(df, name='hillary-clinton', cols=['MentTone'])
with open('clinton_n_menttone.pkl', 'wb') as f:
    pickle.dump(hc, f)

dt = time_serieser(df, name='donald-trump', cols=['MentTone'])
with open('trump_n_menttone.pkl', 'wb') as f:
    pickle.dump(dt, f)

js = time_serieser(df, name='jill-stein', cols=['MentTone'])
with open('stein_n_menttone.pkl', 'wb') as f:
    pickle.dump(js, f)

gj = time_serieser(df, name='gary-johnson', cols=['MentTone'])
with open('johnson_n_menttone.pkl', 'wb') as f:
    pickle.dump(gj, f)

### MentTone

import pickle

# Per-date MentTone series for each candidate, pickled as soon as each one
# is computed. The Johnson series is saved like the other three.
hc = time_serieser(df, name='hillary-clinton', cols=['MentTone'])
with open('clinton_n_menttone.pkl', 'wb') as f:
    pickle.dump(hc, f)

dt = time_serieser(df, name='donald-trump', cols=['MentTone'])
with open('trump_n_menttone.pkl', 'wb') as f:
    pickle.dump(dt, f)

js = time_serieser(df, name='jill-stein', cols=['MentTone'])
with open('stein_n_menttone.pkl', 'wb') as f:
    pickle.dump(js, f)

gj = time_serieser(df, name='gary-johnson', cols=['MentTone'])
with open('johnson_n_menttone.pkl', 'wb') as f:
    pickle.dump(gj, f)

### AvgTone

import pickle

# Per-date AvgTone sums for each candidate. All four candidates use the
# AvgTone column, and the results go to their own *_n_avgtone.pkl files so
# the *_n_menttone.pkl files are not overwritten with a different metric.
hc = time_serieser(df, name='hillary-clinton', cols=['AvgTone'])
with open('clinton_n_avgtone.pkl', 'wb') as f:
    pickle.dump(hc, f)

dt = time_serieser(df, name='donald-trump', cols=['AvgTone'])
with open('trump_n_avgtone.pkl', 'wb') as f:
    pickle.dump(dt, f)

js = time_serieser(df, name='jill-stein', cols=['AvgTone'])
with open('stein_n_avgtone.pkl', 'wb') as f:
    pickle.dump(js, f)

gj = time_serieser(df, name='gary-johnson', cols=['AvgTone'])
with open('johnson_n_avgtone.pkl', 'wb') as f:
    pickle.dump(gj, f)

### ArtiTone

import pickle

# Request the 'ArtiTone' column from time_serieser for each candidate and
# pickle every result right after it is computed.
_artitone_series = []
for _slug, _target in (
    ('hillary-clinton', 'clinton_n_artitone.pkl'),
    ('donald-trump', 'trump_n_artitone.pkl'),
    ('jill-stein', 'stein_n_artitone.pkl'),
    ('gary-johnson', 'johnson_n_artitone.pkl'),
):
    _artitone_series.append(time_serieser(df, name=_slug, cols=['ArtiTone']))
    with open(_target, 'wb') as f:
        pickle.dump(_artitone_series[-1], f)

# Expose each series under its per-candidate name.
hc, dt, js, gj = _artitone_series

### SourceTone

import pickle

# Request the 'SourceTone' column from time_serieser for each candidate and
# pickle every result right after it is computed.
_sourcetone_series = []
for _slug, _target in (
    ('hillary-clinton', 'clinton_n_sourcetone.pkl'),
    ('donald-trump', 'trump_n_sourcetone.pkl'),
    ('jill-stein', 'stein_n_sourcetone.pkl'),
    ('gary-johnson', 'johnson_n_sourcetone.pkl'),
):
    _sourcetone_series.append(
        time_serieser(df, name=_slug, cols=['SourceTone']))
    with open(_target, 'wb') as f:
        pickle.dump(_sourcetone_series[-1], f)

# Expose each series under its per-candidate name.
hc, dt, js, gj = _sourcetone_series

### GoldsteinScale

import pickle

# Per-date GoldsteinScale sums for each candidate; every series is pickled
# right after it is computed.
_goldstein_series = []
for _slug, _target in (
    ('hillary-clinton', 'clinton_n_goldstein.pkl'),
    ('donald-trump', 'trump_n_goldstein.pkl'),
    ('jill-stein', 'stein_n_goldstein.pkl'),
    ('gary-johnson', 'johnson_n_goldstein.pkl'),
):
    _goldstein_series.append(
        time_serieser(df, name=_slug, cols=['GoldsteinScale']))
    with open(_target, 'wb') as f:
        pickle.dump(_goldstein_series[-1], f)

# Expose each series under its per-candidate name.
hc, dt, js, gj = _goldstein_series

In The News: Predicting Winning Political Candidates From News Tone¶

Data Wrangling¶

Client

Cluster

Mentions¶

Sources¶

Articles¶

AvgTone¶

MentTone¶

AvgTone¶

ArtiTone¶

SourceTone¶

GoldsteinScale¶