導入 pandas(主要統計模組), matplotlib(製作圖表用) 及 numpy(計算時有可能會用到):
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Global plotting configuration for the whole notebook.
# Draw the minus sign as a plain hyphen so it does not depend on the CJK font's glyphs.
plt.rcParams['axes.unicode_minus']=False
sns.set_style("whitegrid")
sns.set_context("poster")
# NOTE(review): sns.set() applies its own default style/context when they are not
# passed, so this call presumably overrides the two seaborn calls above — confirm
# which look is actually intended before reordering or merging these lines.
sns.set(font='SimHei', font_scale=1)
IPython 的指令,把圖表直接顯示在 notebook Output 裡
# IPython magic (not plain Python): show matplotlib figures inside the notebook output.
%matplotlib inline
把事先整合好的數據導入:
# Load the pre-aggregated voting summary (one row per recorded motion) and preview it.
legco_cm = pd.read_csv('./cm/legco_cm_summary3.csv')
legco_cm.head()
大概看一下數據,有 ID,日期,時間,動議,動議人,動議種類,結果,地區出席(出席沒投票),地區投票,地區贊成票,地區反對票,地區棄權票,功能組別出席(出席沒投票),投票,贊成票,反對票,棄權票,全體立法會出席(出席沒投票),投票,贊成票,反對票,棄權票,然後就是各個議員在各個動議的投票狀態。
# Inspect the inferred dtype of every column, then the full list of column names.
legco_cm.dtypes
legco_cm.columns
平均一次投票參與人數:
# Mean of the 'overall-present' column across all recorded motions.
legco_cm['overall-present'].mean()
平均投票人數:(主席不投票、當中亦有連棄權也不選的情況)
# Mean of the 'overall-vote' column across all recorded motions.
legco_cm['overall-vote'].mean()

# Overall pass rate across every recorded motion.
total_motions = legco_cm.shape[0]
passed = (legco_cm['result'] == 'Passed').sum()
print(f'紀錄動議總數: {total_motions}')
print(f'獲通過的動議: {passed}')
print(f'總通過率: {passed / total_motions * 100:.1f}%')

labels = ['Passed', 'Negatived']
size = [passed, total_motions - passed]
# The default figure size must be set BEFORE the figure is created; setting it
# afterwards only affects later figures, not this one.
plt.rcParams['figure.figsize'] = (10, 10)
fig1, ax1 = plt.subplots()
ax1.pie(size, labels=labels, autopct='%1.1f%%', startangle=90, textprops={'fontsize': 20})
ax1.axis('equal')
# Derive the motion count from the data instead of hard-coding it in the title.
ax1.set_title(f'For all {total_motions} Motions:')
plt.show()
紀錄中把動議分為立法會成員及公務員兩種。
# Distinct values of 'mover-type'; the code below uses 'Member' and 'Public Officer'.
legco_cm['mover-type'].unique()

# Build each filter once and reuse it for the four counts below.
is_member_motion = legco_cm['mover-type'] == 'Member'
is_gov_motion = legco_cm['mover-type'] == 'Public Officer'
is_passed_motion = legco_cm['result'] == 'Passed'

# .count() ignores rows whose 'result' is missing, exactly like the original filters.
member_motion = legco_cm.loc[is_member_motion, 'result'].count()
member_motion_passed = legco_cm.loc[is_member_motion & is_passed_motion, 'result'].count()
gov_motion = legco_cm.loc[is_gov_motion, 'result'].count()
gov_motion_passed = legco_cm.loc[is_gov_motion & is_passed_motion, 'result'].count()
print(f"由議員提出的議案:{member_motion},獲得通過:{member_motion_passed},通過率:{member_motion_passed / member_motion:.3f}")
print(f"由政府提出的議案:{gov_motion},獲得通過:{gov_motion_passed},通過率:{gov_motion_passed / gov_motion:.3f}")

# Stacked bar chart: passed at the bottom, negatived on top.
# Dedicated names are used for the bar heights so the scalar `passed` computed for
# the overall pass rate is not silently replaced by a list.
passed_counts = [gov_motion_passed, member_motion_passed]
negatived_counts = [gov_motion - gov_motion_passed, member_motion - member_motion_passed]
# Set the default size before drawing so it applies to this figure as well.
plt.rcParams['figure.figsize'] = (10, 10)
p1 = plt.bar([0, 1], passed_counts, 0.35, alpha=0.5)
p2 = plt.bar([0, 1], negatived_counts, 0.35, bottom=passed_counts, alpha=0.5)
plt.ylabel('Number of Motions')
plt.title('Number of Motions by Member/Public Officer')
plt.xticks([0, 1], ('Public Officer', 'Members'))
plt.legend((p1[0], p2[0]), ('Passed', 'Negatived'))
plt.show()

print("Alternative: Pie charts")
labels = ['Passed', 'Negatived']
member_size = [member_motion_passed, member_motion - member_motion_passed]
gov_size = [gov_motion_passed, gov_motion - gov_motion_passed]
fig, ax = plt.subplots(1, 2)
fig.subplots_adjust(hspace=0.5, wspace=0.5)
ax[1].pie(member_size, labels=labels, autopct='%1.1f%%', startangle=90, textprops={'fontsize': 20})
ax[1].set_title('Motions by Members')
ax[0].pie(gov_size, labels=labels, autopct='%1.1f%%', startangle=0, textprops={'fontsize': 20})
# Fixed the misspelled chart title ('Goverment').
ax[0].set_title('Motions by Government')
plt.show()
投票結果:
# Government-moved motions that were nevertheless negatived.
gov_negatived_mask = (legco_cm['mover-type'] == 'Public Officer') & (legco_cm['result'] == 'Negatived')
oops = legco_cm[gov_negatived_mask]
oops[['motion', 'mover', 'overall-vote', 'overall-yes', 'overall-no']]

# Parse the date column into datetimes and reduce the time column to datetime.time values.
legco_cm['vote-date'] = pd.to_datetime(legco_cm['vote-date'])
parsed_vote_time = pd.to_datetime(legco_cm['vote-time'])
legco_cm['vote-time'] = parsed_vote_time.dt.time
legco_cm[['vote-id', 'vote-date', 'vote-time', 'mover-type', 'result']].head()
首先看一下政府動議按月份的分佈
# Tag every vote with its year-month period, then chart how many
# government-moved motions fall into each period.
legco_cm['year-month'] = legco_cm['vote-date'].dt.to_period('M')
public_officer_rows = legco_cm[legco_cm['mover-type'] == 'Public Officer']
public_officer_rows.groupby('year-month').size().plot.bar()
整合起來再看一次...
# Collapse all years together: count government-moved motions per calendar month.
legco_cm['month'] = legco_cm['vote-date'].dt.month
public_officer_rows = legco_cm[legco_cm['mover-type'] == 'Public Officer']
public_officer_rows.groupby('month').size().plot.bar()
看來大部份政府議案都在5、6、10、11月提出。
# Count government-moved motions per weekday (pandas numbers Monday as 0).
legco_cm['day-of-week'] = legco_cm['vote-date'].dt.dayofweek
public_officer_rows = legco_cm[legco_cm['mover-type'] == 'Public Officer']
public_officer_rows.groupby('day-of-week').size().plot.bar()
主要是週三、週四,其次是週五。
# 'vote-time' holds time objects, so go through their string form to extract the
# hour, then count government-moved motions per hour of the day.
vote_time_text = legco_cm['vote-time'].astype('str')
legco_cm['hour'] = pd.to_datetime(vote_time_text).dt.hour
public_officer_rows = legco_cm[legco_cm['mover-type'] == 'Public Officer']
public_officer_rows.groupby('hour').size().plot.bar()
看來午飯 (12 時)及晚飯/下班前(18 時)的動議數字比較高....
# Number of motions that were negatived even though 'overall-yes' exceeded 'overall-no'.
more_yes_than_no = legco_cm['overall-yes'] > legco_cm['overall-no']
was_negatived = legco_cm['result'] == 'Negatived'
len(legco_cm[more_yes_than_no & was_negatived])
佔由議員動議而被否決的 15.51 %
梁頌恆及游蕙禎沒有參與過立法會會議就被取消資格。
# Names of the members analysed below; each name is also used as a column label
# of legco_cm (that column holds the member's vote on every motion).
members = ['梁君彥', '涂謹申', '梁耀忠', '石禮謙', '張宇人', '李國麟', '林健鋒', '黃定光', '李慧琼',
'陳克勤', '陳健波', '梁美芬', '黃國健', '葉劉淑儀', '謝偉俊', '毛孟靜', '田北辰', '何俊賢',
'易志明', '胡志偉', '姚思榮', '馬逢國', '莫乃光', '陳志全', '陳恒鑌', '梁志祥', '梁繼昌',
'麥美娟', '郭家麒', '郭偉强', '郭榮鏗', '張華峰', '張超雄', '黃碧雲', '葉建源', '葛珮帆',
'廖長江', '潘兆平', '蔣麗芸', '盧偉國', '鍾國斌', '楊岳橋', '尹兆堅', '朱凱廸', '吳永嘉',
'何君堯', '何啟明', '林卓廷', '周浩鼎', '邵家輝', '邵家臻', '柯創盛', '容海恩', '陳沛然',
'陳振英', '陳淑莊', '張國鈞', '許智峯', '陸頌雄', '劉國勳', '劉業強', '鄭松泰', '鄺俊宇',
'譚文豪', '范國威', '區諾軒', '鄭泳舜', '謝偉銓', '陳凱欣', '梁國雄', '羅冠聰', '姚松炎',
'劉小麗']
# Deliberately left out of the list: '梁頌恆', '游蕙禎'
# Distinct members who moved at least one motion, and how many such members there are.
member_movers = legco_cm.loc[legco_cm['mover-type'] == 'Member', 'mover'].unique()
print(member_movers)
print(len(member_movers), '人')
把動議數、通過 / 否決數作成一個圖表
# Per-member totals of moved / passed / negatived motions (member-moved motions only).
member_moved = legco_cm[legco_cm['mover-type'] == 'Member']
move_count = member_moved.groupby('mover').size().reset_index(name='counts')
move_pass = member_moved[member_moved['result'] == 'Passed'].groupby('mover').size().reset_index(name='passed')
move_neg = member_moved[member_moved['result'] == 'Negatived'].groupby('mover').size().reset_index(name='negatived')
# Every mover in move_pass / move_neg is already present in move_count, so a left
# join keeps the same rows as the previous outer join.
move_count = pd.merge(move_count, move_pass, on='mover', how='left')
move_count = pd.merge(move_count, move_neg, on='mover', how='left')
move_count[['passed', 'negatived']] = move_count[['passed', 'negatived']].fillna(0).astype('int')
# Sort AFTER merging: an outer merge re-orders rows by the join key, which used to
# throw away the by-count ordering the chart is meant to show.
move_count = move_count.sort_values('counts').reset_index(drop=True)

# Horizontal stacked bars: passed first, negatived stacked to its right.
plt.rcParams['figure.figsize'] = (10, 20)
inp = np.arange(move_count.shape[0])
p1 = plt.barh(inp, move_count['passed'], 1, alpha=0.7)
p2 = plt.barh(inp, move_count['negatived'], 1, left=move_count['passed'], alpha=0.7)
plt.title('Number of Motions by Member')
plt.yticks(inp, move_count['mover'])
plt.legend((p1[0], p2[0]), ('Passed', 'Negatived'), loc='center right')
plt.show()
作圖的話 73 名議員太長不太好看...
# For every member, count how often each vote value occurs in that member's column.
# The code below expects the values 'Yes', 'No', 'Abstain', 'Present' and 'Absent'.
df = pd.DataFrame([legco_cm.groupby(member).size() for member in members])
# fillna()/astype() return a new frame; the result was previously discarded, which
# forced every later expression to repeat fillna(0). Keep it this time.
df = df.fillna(0).astype('int')
df['member'] = members
df['vote_num'] = df['Yes'] + df['No'] + df['Abstain']
# Share (in %) of motions on which the member actually cast a Yes/No/Abstain vote.
df['vote_rate'] = df['vote_num'] / (df['vote_num'] + df['Present'] + df['Absent']) * 100
df.sort_values('vote_rate').head(10).fillna(0)

# Combine the motion counts per mover with the attendance/voting figures per member.
member_summary = pd.merge(move_count, df[['Absent', 'Present', 'vote_num', 'vote_rate', 'member']],
                          right_on='member',
                          left_on='mover',
                          how='outer')
member_summary = member_summary.drop(columns=['mover']).fillna(0)
member_summary.head(10)

# Members who moved at least one motion but never had one passed.
# Both mover lists are loop-invariant, so compute them once up front.
member_moved = legco_cm[legco_cm['mover-type'] == 'Member']
all_movers = member_moved.mover.unique()
successful_movers = member_moved[member_moved['result'] == 'Passed'].mover.unique()
successful_lookup = set(successful_movers)
loser = [name for name in all_movers if name not in successful_lookup]
print('Loser name list: ', loser)
print(len(loser), '人')
member_summary[member_summary['member'].isin(loser)]

# Members who had at least one of their own motions passed.
print(successful_movers)
print(len(successful_movers), '人')
member_summary[~member_summary['member'].isin(loser)]

# Members who never moved a motion, most absences first.
member_summary[member_summary['counts'] == 0].sort_values(['Absent'], ascending=False)
能看得出劉業強、鍾國斌、石禮謙不但完全沒動議,連投票率也不到 60%。
# Ten members with the highest 'vote_num'.
member_summary.sort_values('vote_num', ascending=False).head(10)
# Ten members with the lowest 'vote_rate'.
member_summary.sort_values('vote_rate', ascending=True).head(10)
從 Wikipedia 上找到各個議員的所屬政黨
from bs4 import BeautifulSoup

# Read the saved party table page. The encoding is given explicitly so the Chinese
# text does not depend on the platform default.
# NOTE(review): assumes party.html was saved as UTF-8 — confirm.
with open('./party.html', 'r', encoding='utf-8') as f:
    party = f.read()
party_soup = BeautifulSoup(party, 'html.parser')
tr = party_soup.find_all('tr')

# For every table row keep two text fragments: the last line of the first
# blank-line-separated chunk (the member name) and the first line of the last
# chunk (the party).
tr_text = []
for row in tr:
    chunks = row.text.strip().split('\n\n')
    a = chunks[0].split('\n')[-1]
    b = chunks[-1].split('\n')[0]
    tr_text.append([a, b])

# Drop the header row and the vacant-seat rows. list.remove raises ValueError if
# the page layout changes and one of these rows is missing.
tr_text.remove(['備註', '席位'])
tr_text.remove(['懸空', ''])
for _ in range(3):
    tr_text.remove(['懸空', '懸空'])
party_pd = pd.DataFrame(tr_text)

# correct error due to format
party_pd.loc[party_pd[0] == '吳永嘉', 1] = '經民聯'
party_pd.loc[party_pd[0] == '邵家輝', 1] = '自由黨'

# add missing members ('梁頌恆' and '游蕙禎', both '青年新政', are deliberately left out)
missing = pd.DataFrame([{0: '張華峰', 1: '經民聯'},
                        {0: '何啟明', 1: '工聯會'},
                        {0: '范國威', 1: '香港本土'},
                        {0: '區諾軒', 1: '獨立民主派'},
                        {0: '梁國雄', 1: '社民連'},
                        {0: '羅冠聰', 1: '眾志'},
                        {0: '姚松炎', 1: '專業議政'},
                        {0: '劉小麗', 1: '工黨'},
                        ])
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
party_pd = pd.concat([party_pd, missing], ignore_index=True)

# Collapse dual affiliations to a single party label in one pass.
party_pd.replace({'公民黨/專業議政': '公民黨',
                  '教協/專業議政': '專業議政',
                  '民建聯/新界社團聯會': '民建聯',
                  '新民黨/公民力量': '新民黨',
                  '公專聯/專業議政': '公專聯',
                  '經民聯/西九新動力': '經民聯'},
                 inplace=True)
party_pd[1].unique()
party_pd.head()

# Attach the party (column 1) to every member's summary row.
member_summary_party = pd.merge(member_summary, party_pd, left_on='member', right_on=0, how='left')
member_summary_party.drop(columns=0, inplace=True)
member_summary_party

# Average voting rate per party, lowest first.
pty_vote_rate = member_summary_party.groupby(1).vote_rate.mean().sort_values()
plt.rcParams['figure.figsize'] = (10, 10)
ax = pty_vote_rate.plot(kind='barh', alpha=0.7, title='各政黨的平均投票率')
ax.set_xlabel('投票率(%)')
ax.set_ylabel("政黨")
ax.axvline(x=50, color='red', ls='--', alpha=1, label='50%')
為分析各政黨議員投票的統一性,我們先定義一個統一性的分數。
$$Score = 1 - \frac{2A \cdot Yes \cdot No + 2B \cdot Yes \cdot Abstain + 2C \cdot No \cdot Abstain}{(Yes + No + Abstain)^2}$$

Score = 1 為同一次投票內選擇完全一致(不包括缺席),越分散分數越低。
就每一次的投票結果而言 Yes 和 No 及 Abstain 是對立的,但 No 和 Abstain 雖然立場有不同但造成的結果一致,所以把 Yes-No 和 Yes-Abstain 的比重 (A, B) 設成 2,而 No-Abstain (C) 則設成 0.5。把 function 寫成可以改變比重的模式方便日後(反悔時)調整。
# Keep only the per-member vote columns: one column per name in `members`.
legco_member_vote = legco_cm[members]
def diff_vote(inputList, weight=(2, 2, 0.5)):
    """Return the unity score of one group of votes.

    Only the values 'Yes', 'No' and 'Abstain' are counted; every other value is
    ignored. The score is

        1 - 2 * (w0*yes*no + w2*no*abstain + w1*yes*abstain) / (yes + no + abstain)**2

    so it is 1 when all counted votes agree and drops as they diverge.

    Args:
        inputList: iterable of vote values.
        weight: three penalties, indexed as (Yes-vs-No, Yes-vs-Abstain,
            No-vs-Abstain). An immutable tuple is used as the default so the
            default can never be altered between calls.

    Returns:
        The score as a float, or NaN when nobody cast a Yes/No/Abstain vote.
    """
    from collections import Counter

    tally = Counter(inputList)
    yes = tally['Yes']
    no = tally['No']
    abstain = tally['Abstain']
    total = yes + no + abstain
    if total == 0:
        # Nobody voted: the score is undefined rather than zero.
        return np.nan
    penalty = (weight[0] * 2 * (yes * no)
               + weight[2] * 2 * (no * abstain)
               + weight[1] * 2 * (yes * abstain))
    return 1 - penalty / total**2
測試:
# Trial run of the unity score on a single party (民建聯).
party_member_list = list(party_pd.groupby(1).get_group('民建聯')[0])
# Select the party's vote columns once, with a real list: indexing a DataFrame
# with a generator is fragile, and the selection does not change from row to row.
party_columns = [member for member in legco_member_vote.columns if member in party_member_list]
# One score per motion, in row order.
test = [diff_vote(row) for row in legco_member_vote[party_columns].to_numpy()]
legco_cm['民建聯'] = test
print(legco_cm['民建聯'].describe())
# Motions on which the party did not vote as one block, with each member's vote.
legco_cm[legco_cm['民建聯'] < 1][[member for member in legco_cm.columns if member in party_member_list]+['motion']]
如果每一個政黨都要把投票不一的議題全部列出會太佔位置,所以把迴圈修改為只列出 motion id。同時亦把只有一名立法會議員的政黨除掉。
# Unity score of every party on every motion, summarised in one row per party.
party_vote_summary = []
party_groups = party_pd.groupby(1)  # grouping does not change inside the loop
for party in party_pd[1].unique():
    party_member_list = list(party_groups.get_group(party)[0])
    # Select the party's vote columns once, with a real list: indexing a DataFrame
    # with a generator is fragile, and the selection is the same for every row.
    party_columns = [member for member in legco_member_vote.columns if member in party_member_list]
    test = [diff_vote(row) for row in legco_member_vote[party_columns].to_numpy()]
    # Row positions of motions where the party split; NaN scores (nobody voted)
    # compare False and are therefore skipped.
    diff_list = [i for i, score in enumerate(test) if score < 1]
    legco_cm[party] = test
    party_vote = {'party-name': party,
                  'num-members': len(party_columns),
                  'involed-num-motion': legco_cm[party].count(),
                  'num-motion-not-unify': len(diff_list),
                  'motion-list': diff_list,
                  'score-mean': legco_cm[party].mean(),
                  'score-sd': legco_cm[party].std()
                  }
    party_vote_summary.append(party_vote)
party_vote_summary_df = pd.DataFrame(party_vote_summary)
# Single-member parties are trivially unanimous, so leave them out of the ranking.
party_vote_summary_df[party_vote_summary_df['num-members'] > 1].sort_values('score-mean')
可以看到以獨立民主派分歧最大,新民黨最統一(但只有 2 人),其次是工聯會和民建聯。
# Horizontal bar chart of the mean unity score for parties with more than one member.
has_several_members = party_vote_summary_df['num-members'] > 1
party_vote_summary = party_vote_summary_df[has_several_members].sort_values('score-mean')
inp = np.arange(len(party_vote_summary))
plt.barh(inp, party_vote_summary['score-mean'], alpha=0.7)
plt.yticks(inp, party_vote_summary['party-name'])
plt.title('Voting Score of Parties')
plt.show()
分類根據 wikipedia 對建制和泛民的定義
# Party labels grouped by camp; the labels must match the values in column 1 of party_pd.
# Pro-establishment parties.
proBJ = ['民建聯', '工聯會', '經民聯', '自由黨', '新民黨', '實政圓桌', '新論壇', '勞聯']
# Pan-democratic parties.
proDem = ['民主黨', '公民黨', '工黨', '街工', '公專聯', '人民力量', '社民連']
# Every party label, used to score the council as a whole.
all_parties = ['新民黨', '工聯會', '民主黨', '民建聯', '公民黨', '經民聯', '香港本土', '獨立建制派', '獨立民主派',
'實政圓桌', '熱血公民', '工黨', '人民力量', '自由黨', '專業議政', '公專聯', '獨立中間派', '勞聯',
'新論壇', '街工', '社民連', '眾志']
def getPartyMember(partyList):
    """Return the names of all members belonging to any party in partyList.

    Names are looked up in the module-level party_pd table (column 0 = member,
    column 1 = party) and returned party by party, in the order given.
    Raises KeyError when a party has no rows in party_pd.
    """
    collected = []
    for party_name in partyList:
        rows_of_party = party_pd.groupby(1).get_group(party_name)
        collected.extend(rows_of_party[0])
    return collected
# Unity score per camp (and for the whole council), computed like the per-party scores.
side_vote_summary = []
side = [{'name': '建制', 'party-list': proBJ},
        {'name': '泛民', 'party-list': proDem},
        {'name': '立法會全體', 'party-list': all_parties}
        ]
for s in side:
    party_member_list = getPartyMember(s['party-list'])
    camp_members = set(party_member_list)
    # Select the camp's vote columns once, with a real list: indexing a DataFrame
    # with a generator is fragile, and the selection is the same for every row.
    camp_columns = [member for member in legco_member_vote.columns if member in camp_members]
    test = [diff_vote(row) for row in legco_member_vote[camp_columns].to_numpy()]
    # Row positions of motions where the camp split; NaN scores compare False.
    diff_list = [i for i, score in enumerate(test) if score < 1]
    legco_cm[s['name']] = test
    party_vote = {'party-name': s['name'],
                  'num-members': len(camp_columns),
                  'involed-num-motion': legco_cm[s['name']].count(),
                  'num-motion-not-unify': len(diff_list),
                  'motion-list': diff_list,
                  'score-mean': legco_cm[s['name']].mean(),
                  'score-sd': legco_cm[s['name']].std()
                  }
    side_vote_summary.append(party_vote)
side_summary_df = pd.DataFrame(side_vote_summary)
side_summary_df

inp = np.arange(side_summary_df.shape[0])
plt.bar(inp, side_summary_df['score-mean'], alpha=0.7)
plt.title('Voting Score of Parties')
# Take the tick labels from the summary itself so they cannot drift from `side`.
plt.xticks(inp, side_summary_df['party-name'])
plt.show()
在把立法會議員分成建制及泛民兩邊時我們不難發現建制派有著人數上的優勢,如果把獨立建制派也加在內的話總人數達 43 人,也就是 2/3 的總人數。但是把投票並沒有完全統一及缺席也加算在內的話,在議員動議統計也不難發現建制派的動議被否決或是泛民派的動議通過。在這部份我們將探討一下有多少動議是如果泛民派足夠團結/全員出席後有機會改變結果。
# Count independent pro-establishment members as part of the camp. The list is
# extended in place, and only once, so re-running this cell cannot add duplicates.
if '獨立建制派' not in proBJ:
    proBJ.append('獨立建制派')

# Look up each member's party through a plain dict; testing a NumPy array with
# `in` against a list breaks when a member has no row or more than one row.
party_of_member = dict(zip(member_summary_party['member'], member_summary_party[1]))
proBjMember = [name for name in members if party_of_member.get(name) in proBJ]

# Per motion: how many camp members voted Yes, how many voted No or Abstain, and
# how many did neither. The previous code compared against the misspelling
# 'Astain', so abstentions were counted as absent instead of as negative votes.
pro_bj_votes = legco_member_vote[proBjMember]
proBJPass = (pro_bj_votes == 'Yes').sum(axis=1).to_numpy(dtype=float)
proBJNeg = pro_bj_votes.isin(['No', 'Abstain']).sum(axis=1).to_numpy(dtype=float)
proBJAbs = len(proBjMember) - proBJPass - proBJNeg
legco_cm['Pro-BJ-pass'] = proBJPass
legco_cm['Pro-BJ-neg'] = proBJNeg
legco_cm['Pro-BJ-abs'] = proBJAbs
legco_cm[['Pro-BJ-pass', 'Pro-BJ-neg', 'Pro-BJ-abs', 'overall-yes', 'overall-no', 'result']].head()
假如加上獨立民主派 4 名議員,泛民主派議員人數一共 23 人,查看一下有沒有 23 票能改變的議題。我們先把建制派的投票中支持票及反對票差別少於 23 票的動議找出來。(由議員提出的動議需要分組投票才能通過,為簡化過程這部份我們只看由政府提出的動議。)
# Absolute margin between the camp's supporting and opposing votes on each motion.
pro_bj_margin = legco_cm['Pro-BJ-pass'] - legco_cm['Pro-BJ-neg']
legco_cm['pro-BJ-diff'] = pro_bj_margin.abs()
# Government-moved motions whose margin is at most 23 votes.
within_reach = (legco_cm['pro-BJ-diff'] <= 23) & (legco_cm['mover-type'] == 'Public Officer')
legco_cm[within_reach][['vote-id', 'Pro-BJ-pass', 'Pro-BJ-neg', 'overall-vote', 'overall-yes', 'result']]
當中由政府提出的 20170517163 號及 20180509067 號看來是泛民主派有機會以及想要拉倒的動議。
# Show the details of two specific votes, selected by 'vote-id'.
# NOTE(review): the ids are compared as strings; if read_csv parsed 'vote-id' as
# integers these filters would match nothing — confirm the column dtype.
legco_cm[legco_cm['vote-id'] == '20170517163'][['motion', 'mover', 'overall-vote', 'overall-yes', 'Pro-BJ-pass', 'overall-no']]
legco_cm[legco_cm['vote-id'] == '20180509067'][['motion', 'mover', 'overall-vote', 'overall-yes', 'Pro-BJ-pass', 'overall-no']]
都是年度撥款條例草案,然而參與投票不足下未能達到想要的結果。(當然在現實建制派發現泛民主派投票人數增加也有動員參加投票的可能,所以並不能說現實上如果泛民主派努力一點團結一點就能成功。在這裡只是以這個假想情況作例子示範一下 pandas 如何幫我們找到想要的數據。)
在這個部份我們試著把各議員的投票傾向分類和視像化。首先我們會用上面提及的評分方法計算議員之間的距離評分,議員之間投票的相似度可以用 heat map 來視像化。然後我們會用 Multidimensional Scaling (MDS)方法把不易看懂的 heat map 轉換成二維坐標圖分析。
這裡以之前定義的評分來計算投票距離 (1-Score),使 0 為最近,投票方向越不同距離越大。
member_vote = legco_cm[members].drop(columns='梁君彥')  # drop him as he is the chairman of cm who did not vote at all

# Pairwise voting distance between members: 1 - mean(diff_vote of the two votes),
# averaged over the motions where at least one of the two cast a vote.
#
# Calling diff_vote on a two-row DataFrame slice for every (member, member, motion)
# triple took more than 30 minutes. A pair of votes has only 4 x 4 possible
# outcomes, so score each outcome once with diff_vote and look the rest up.
vote_values = ('Yes', 'No', 'Abstain')
other_code = len(vote_values)  # anything else: absent, present-not-voting, missing
outcomes = vote_values + (None,)
pair_score = np.array([[diff_vote([first, second]) for second in outcomes]
                       for first in outcomes])

# Encode every vote as an integer code (rows = motions, columns = members).
vote_codes = np.full(member_vote.shape, other_code, dtype=np.intp)
for code, value in enumerate(vote_values):
    vote_codes[(member_vote == value).to_numpy()] = code

member_names = list(member_vote.columns)
member_total = len(member_names)
distance = np.zeros((member_total, member_total))
for a in range(member_total):
    # The score does not depend on the order of the pair, so fill both halves at once.
    for b in range(a, member_total):
        scores = pair_score[vote_codes[:, a], vote_codes[:, b]]
        distance[a, b] = distance[b, a] = 1 - np.nanmean(scores)

# Same layout as before: one dict per member, keyed by the other member's name.
matrix = [dict(zip(member_names, row)) for row in distance]
matrix_df = pd.DataFrame(matrix)
把 index 換成名字。
# Map each positional row label (0, 1, 2, ...) to the member name in the same
# position, so rows and columns of the matrix carry the same labels.
rename_dict = dict(enumerate(member_vote.columns))
matrix_df.rename(rename_dict, inplace=True)
matrix_df.head()
把矩陣以 heatmap 展示。(基本上看不出什麼...)
# Heat map of the member-to-member distance matrix, colour scale fixed to [0, 1].
ax = sns.heatmap(matrix_df, vmin=0, vmax=1)
導入 scikit learn 的 MDS 模組。
from sklearn.manifold import MDS

# Project the precomputed distance matrix onto two dimensions; random_state is
# fixed so the layout is reproducible.
model = MDS(n_components=2, dissimilarity='precomputed', random_state=1)
out = model.fit_transform(matrix_df)
member_scatt = pd.DataFrame({'member': matrix_df.columns,
                             'x': out[:, 0],
                             'y': out[:, 1]})
# Attach each member's party (column 1) for colouring.
member_scatt = member_scatt.merge(member_summary_party[['member', 1]], on='member')
member_scatt.head()
plt.rcParams['figure.figsize'] = (15, 15)
# Pass x and y by keyword: recent seaborn releases no longer accept them positionally.
sns.scatterplot(x=member_scatt['x'], y=member_scatt['y'], hue=member_scatt[1], s=100)
def label_point(x, y, val, ax):
    """Write str(val[i]) next to every point (x[i], y[i]) on the axes `ax`.

    Each label is shifted by +0.005 in x and -0.002 in y so it does not sit
    directly on top of its marker. The three sequences are walked in lockstep.
    """
    for x_pos, y_pos, label in zip(x, y, val):
        ax.text(x_pos + .005, y_pos - 0.002, str(label))
# Label every projected point on the current axes with the member's name.
label_point(out[:, 0], out[:, 1], matrix_df.columns, plt.gca())
在這個圖可以看出建制派整體的投票比較一致,泛民主派雖然投票傾向遠離建制派,但分佈比較離散。