From 012c40c640bfbe96404c3dd178ae7a9e5cc53ee2 Mon Sep 17 00:00:00 2001 From: linozen Date: Thu, 19 Aug 2021 16:40:21 +0200 Subject: [PATCH] first graph rendering on media (until MSsoc1) --- explorer/media.py | 636 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 540 insertions(+), 96 deletions(-) diff --git a/explorer/media.py b/explorer/media.py index c5aadff..f8b005d 100644 --- a/explorer/media.py +++ b/explorer/media.py @@ -1,31 +1,96 @@ import streamlit as st import pandas as pd import numpy as np -import phik +import plotly.graph_objects as go import plotly.express as px -import base64 -from io import BytesIO +from pathlib import Path + +from lib.figures import ( + generate_pie_chart, + generate_histogram, + generate_stacked_bar_chart, + generate_ranking_plot, +) + +from lib.download import ( + get_csv_download_link, + get_excel_download_link, +) + + +############################################################################### +# Helper functions +############################################################################### + + +@st.cache +def render_pie_chart( + df, + values, + names, + color=None, + color_discrete_sequence=px.colors.qualitative.Prism, + color_discrete_map=None, + hover_name=None, +): + return generate_pie_chart( + df, + values, + names, + hover_name, + color, + color_discrete_sequence, + color_discrete_map, + ) + + +@st.cache +def render_histogram(df, x, y, nbins, color, color_discrete_map, labels): + return generate_histogram(df, x, y, nbins, color, color_discrete_map, labels) + + +@st.cache +def render_stacked_bar_chart(data): + return generate_stacked_bar_chart(data) + + +@st.cache +def render_ranking_plot(input_col): + return generate_ranking_plot(df[filter], input_col, bodies, scoring) + + +@st.cache +def read_markdown_file(file): + return Path(file).read_text() + + +@st.cache +def get_corr_matrix(df): + df = pd.read_pickle("./data/media_corr.pkl") + fig = px.imshow(df, zmin=0, zmax=1, color_continuous_scale="viridis", height=1300) + return fig + + +@st.cache +def get_significance_matrix(df): + df = pd.read_pickle("./data/media_sig.pkl") + fig = px.imshow(df, zmin=-5, zmax=5, color_continuous_scale="viridis", height=1300) + return fig -# Factors to filter by (CS only) -# ------------------------------ -# MShr1 -# MSgender -# MSattitude1-2 ################################################################################### # General configuration ################################################################################### + + st.set_page_config( page_title="IOI Survey Data Explorer (MS only)", - layout="wide", - initial_sidebar_state="collapsed", ) -st.title("IOI Survey Data Explorer (MS only)") -################################################################################### -# Data acquisition -################################################################################### +############################################################################### +# Data wrangling +############################################################################### @st.cache() @@ -172,6 +237,7 @@ def get_ms_df(): "lastpage", "MShr1", "MShr2", + "MShr3[daily_newspaper]", "MShr3[weekly_newspaper]", "MShr3[magazine]", "MShr3[tv]", @@ -349,15 +415,20 @@ def get_ms_df(): return df -################################################################################### -# Define DataFrame -################################################################################### +############################################################################### +# Define base DataFrame +############################################################################### + + df = get_ms_df() df = df.reset_index(drop=True) -################################################################################### + +############################################################################### # Make answers human-readable -################################################################################### +############################################################################### + + df["MShr1"] = df["MShr1"].replace( { "AO01": "Full-time", @@ -821,9 +892,11 @@ def get_ms_df(): "AO05": "Other", } ) -################################################################################### + + +############################################################################### # Make answers analysable (change data types etc.) -################################################################################### +############################################################################### df["MShr2"] = df["MShr2"].replace("?", np.nan) df["MShr2"] = df["MShr2"].replace("0,5", 0.5) df["MShr2"] = pd.to_numeric(df["MShr2"], errors="coerce") @@ -878,26 +951,17 @@ def get_ms_df(): df[col] = df[col].replace(np.nan, False) df[col] = df[col].replace("Y", True) df[col] = df[col].astype("bool") -################################################################################### + + +############################################################################### # Filter logic -################################################################################### -# TODO filter by attitudes1-2 +############################################################################### + + filters = { "country": st.sidebar.selectbox( "Country", ["All", "United Kingdom", "Germany", "France"] ), - "MShr1": st.sidebar.selectbox( - "Employment status", - [ - "All", - "Full-time", - "Part-time (>50%)", - "Part-time (<50%)", - "Freelance", - "Unpaid", - "Other", - ], - ), } filter = np.full(len(df.index), True) @@ -907,85 +971,463 @@ def get_ms_df(): else: filter = filter & (df[column_name] == selectbox) -################################################################################### -# Provide download links -################################################################################### -# Save a useful snapshot of the merged data + +############################################################################### +# Save a useful snapshot of the data (comment out for production) +############################################################################### + + df.to_pickle("./data/media.pkl") df.to_excel("./data/media.xlsx") df.to_csv("./data/media.csv") -@st.cache -def get_csv_download_link(df): - """Generates a link allowing the data in a given panda dataframe to be downloaded - in: dataframe - out: href string - """ - csv = df.to_csv(index=False) - b64 = base64.b64encode( - csv.encode() - ).decode() # some strings <-> bytes conversions necessary here - href = f'Download as CSV file' - return href - - -def to_excel(df): - output = BytesIO() - writer = pd.ExcelWriter(output, engine="xlsxwriter") - df.to_excel(writer, sheet_name="Sheet1") - writer.save() - processed_data = output.getvalue() - return processed_data +############################################################################### +# Custom CSS +############################################################################### -@st.cache -def get_excel_download_link(df): - """Generates a link allowing the data in a given panda dataframe to be downloaded - in: dataframe - out: href string - """ - val = to_excel(df) - b64 = base64.b64encode(val) - return f'Download as Excel file' +st.markdown( + """ """, + unsafe_allow_html=True, +) -st.write(get_csv_download_link(df), unsafe_allow_html=True) -st.write(get_excel_download_link(df), unsafe_allow_html=True) +############################################################################### +# Display dynamic charts +############################################################################### -################################################################################### -# Display table -################################################################################### -st.dataframe(df[filter], height=1000) -################################################################################### -# Display dynamic charts -################################################################################### -# Correlation matrix +st.title("IOI Survey Data Explorer (MS only)") +st.write("# General") -@st.cache -def get_corr_matrix(df): - df = pd.read_pickle("./data/media_corr.pkl") - fig = px.imshow(df, zmin=0, zmax=1, color_continuous_scale="viridis", height=1300) - return fig +merged_markdown = read_markdown_file("explorer/markdown/media.md") +st.markdown(merged_markdown, unsafe_allow_html=True) +st.write("### Country `[country]`") +country_counts = df[filter]["country"].value_counts() +st.plotly_chart( + render_pie_chart( + df[filter], + values=country_counts, + names=country_counts.index, + ) +) -@st.cache -def get_significance_matrix(df): - df = pd.read_pickle("./data/media_sig.pkl") - fig = px.imshow(df, zmin=-5, zmax=5, color_continuous_scale="viridis", height=1300) - return fig +st.write("# Resources") + +st.write("## Human Resources") + +st.write("### What is your employment status `[MShr1]`") +MShr1_counts = df[filter]["MShr1"].value_counts() +st.plotly_chart( + render_pie_chart( + MShr1_counts, + values=MShr1_counts, + names=MShr1_counts.index, + color=MShr1_counts.index, + color_discrete_map={ + "Full-time": px.colors.qualitative.Prism[0], + "Part-time (>50%)": px.colors.qualitative.Prism[1], + "Part-time (<50%)": px.colors.qualitative.Prism[2], + "Freelance": px.colors.qualitative.Prism[4], + "Other": px.colors.qualitative.Prism[10], + }, + ) +) +st.write( + "### How many days per month do you work on surveillance by intelligence agencies? `[MShr2]`" +) +st.plotly_chart( + render_histogram( + df=df[filter], + x="MShr2", + y=None, + nbins=None, + color="country", + color_discrete_map={ + "Germany": px.colors.qualitative.Prism[5], + "France": px.colors.qualitative.Prism[1], + "United Kingdom": px.colors.qualitative.Prism[7], + }, + labels={"MShr2": "days per month"}, + ) +) + +st.write("### Which type of medium do you work for? `[MShr3]`") +MShr3_df = pd.DataFrame(columns=("option", "count", "country")) +# TODO Map proper labels +for label in [ + "daily_newspaper", + "weekly_newspaper", + "magazine", + "tv", + "radio", + "news_agency", + "online_stand_alone", + "online_of_offline", +]: + MShr3_data = df[filter]["country"][df[f"MShr3[{label}]"] == 1].tolist() + for i in MShr3_data: + MShr3_df = MShr3_df.append( + {"option": label, "count": MShr3_data.count(i), "country": i}, + ignore_index=True, + ) +MShr3_df = MShr3_df.drop_duplicates() + +st.plotly_chart( + render_histogram( + MShr3_df, + x="option", + y="count", + nbins=None, + color="country", + color_discrete_map={ + "Germany": px.colors.qualitative.Prism[5], + "France": px.colors.qualitative.Prism[1], + "United Kingdom": px.colors.qualitative.Prism[7], + }, + labels={"count": "people who work for this medium"}, + ) +) + +st.write( + "### Within the past year, did you have enough time to cover surveillance by intelligence agencies? `[MShr4]`" +) +MShr4_counts = df[filter]["MShr4"].value_counts() +st.plotly_chart( + render_pie_chart( + MShr4_counts, + values=MShr4_counts, + names=MShr4_counts.index, + color=MShr4_counts.index, + color_discrete_map={ + "I had enough time": px.colors.qualitative.Prism[9], + "I had some time": px.colors.qualitative.Prism[8], + "I had very little time": px.colors.qualitative.Prism[7], + "I had no time": px.colors.qualitative.Prism[6], + "I don't know": px.colors.qualitative.Prism[10], + "I prefer not to say": px.colors.qualitative.Prism[10], + }, + ) +) + +st.write("## Expertise") + +st.write( + "### How many years have you spent working on surveillance by intelligence agencies? `[MSexpertise1]`" +) +st.plotly_chart( + render_histogram( + df[filter], + x="MSexpertise1", + y=None, + nbins=20, + color="country", + color_discrete_map={ + "Germany": px.colors.qualitative.Prism[5], + "France": px.colors.qualitative.Prism[1], + "United Kingdom": px.colors.qualitative.Prism[7], + }, + labels={"MSexpertise1": "years"}, + ) +) + +st.write( + "### How do you assess your level of expertise concerning the **legal** aspects of surveillance by intelligence agencies? For example, knowledge of intelligence law, case law. `[MSexpertise2]`" +) +MSexpertise2_counts = df[filter]["MSexpertise2"].value_counts() +st.plotly_chart( + render_pie_chart( + df[filter], + values=MSexpertise2_counts, + names=MSexpertise2_counts.index, + color_discrete_sequence=None, + color=MSexpertise2_counts.index, + color_discrete_map={ + "Expert knowledge": px.colors.qualitative.Prism[9], + "Advanced knowledge": px.colors.qualitative.Prism[8], + "Some knowledge": px.colors.qualitative.Prism[7], + "Basic knowledge": px.colors.qualitative.Prism[6], + "No knowledge": px.colors.qualitative.Prism[5], + "I don't know": px.colors.qualitative.Prism[10], + "I prefer not to say": px.colors.qualitative.Prism[10], + }, + ) +) st.write( - "# Correlation Matrix (Phik `φK`) \nPhik (φk) is a new and practical correlation coefficient that works consistently between categorical, ordinal and interval variables, captures non-linear dependency and reverts to the Pearson correlation coefficient in case of a bivariate normal input distribution. There is extensive documentation available [here](https://phik.readthedocs.io/en/latest/index.html)" + "### How do you assess your level of expertise concerning the **political** aspects of surveillance by intelligence agencies?` [MSexpertise3]`" ) +MSexpertise3_counts = df[filter]["MSexpertise3"].value_counts() +st.plotly_chart( + render_pie_chart( + df[filter], + values=MSexpertise3_counts, + names=MSexpertise3_counts.index, + color_discrete_sequence=None, + color=MSexpertise3_counts.index, + color_discrete_map={ + "Expert knowledge": px.colors.qualitative.Prism[9], + "Advanced knowledge": px.colors.qualitative.Prism[8], + "Some knowledge": px.colors.qualitative.Prism[7], + "Basic knowledge": px.colors.qualitative.Prism[6], + "No knowledge": px.colors.qualitative.Prism[5], + "I don't know": px.colors.qualitative.Prism[10], + "I prefer not to say": px.colors.qualitative.Prism[10], + }, + ) +) + +st.write( + "### How do you assess your level of expertise concerning the **technical** aspects of surveillance by intelligence agencies?` [MSexpertise4]`" +) +MSexpertise4_counts = df[filter]["MSexpertise4"].value_counts() +st.plotly_chart( + render_pie_chart( + df[filter], + values=MSexpertise4_counts, + names=MSexpertise4_counts.index, + color_discrete_sequence=None, + color=MSexpertise4_counts.index, + color_discrete_map={ + "Expert knowledge": px.colors.qualitative.Prism[9], + "Advanced knowledge": px.colors.qualitative.Prism[8], + "Some knowledge": px.colors.qualitative.Prism[7], + "Basic knowledge": px.colors.qualitative.Prism[6], + "No knowledge": px.colors.qualitative.Prism[5], + "I don't know": px.colors.qualitative.Prism[10], + "I prefer not to say": px.colors.qualitative.Prism[10], + }, + ) +) + +st.write("## Financial Resources") + +st.write( + "### How do you assess the financial resources that have been available for your work on intelligence over the past 5 years? `[MSfinance1]`" +) +MSfinance1_counts = df[filter]["MSfinance1"].value_counts() +st.plotly_chart( + render_pie_chart( + df[filter], + values=MSfinance1_counts, + names=MSfinance1_counts.index, + color_discrete_sequence=None, + color=MSfinance1_counts.index, + color_discrete_map={ + "A great deal of funding": px.colors.qualitative.Prism[9], + "Sufficient funding": px.colors.qualitative.Prism[8], + "Some funding": px.colors.qualitative.Prism[7], + "Little funding": px.colors.qualitative.Prism[6], + "No funding": px.colors.qualitative.Prism[5], + "I don't know": px.colors.qualitative.Prism[10], + "I prefer not to say": px.colors.qualitative.Prism[10], + }, + ) +) + +st.write( + "### If you wanted to conduct investigative research into surveillance by intelligence agencies, could you access extra funding for this research? (For example, a special budget or a stipend) `[MSfinance2]`" +) +MSfinance2_counts = df[filter]["MSfinance2"].value_counts() +st.plotly_chart( + render_pie_chart( + MSfinance2_counts, + values=MSfinance2_counts, + names=MSfinance2_counts.index, + color=MSfinance2_counts.index, + color_discrete_map={ + "No": px.colors.qualitative.Prism[8], + "Yes": px.colors.qualitative.Prism[2], + "I don't know": px.colors.qualitative.Prism[10], + "I prefer not to say": px.colors.qualitative.Prism[10], + }, + ) +) + +st.write("## Freedom of Information") + +st.write( + "### Have you requested information under the national Freedom of Information Law  when you worked on intelligence-related issues over the past 5 years? `[MSfoi1]`" +) +MSfoi1_counts = df[filter]["MSfoi1"].value_counts() +st.plotly_chart( + render_pie_chart( + MSfoi1_counts, + values=MSfoi1_counts, + names=MSfoi1_counts.index, + color=MSfoi1_counts.index, + color_discrete_map={ + "No": px.colors.qualitative.Prism[8], + "Yes": px.colors.qualitative.Prism[2], + "I don't know": px.colors.qualitative.Prism[10], + "I prefer not to say": px.colors.qualitative.Prism[10], + }, + ) +) + +st.write("### How often did you request information? `[MSfoi2]`") +st.plotly_chart( + render_histogram( + df[filter], + x="MSfoi2", + y=None, + nbins=10, + color="country", + color_discrete_map={ + "Germany": px.colors.qualitative.Prism[5], + "France": px.colors.qualitative.Prism[1], + "United Kingdom": px.colors.qualitative.Prism[7], + }, + labels={"MSfoi2": "Number of requests"}, + ) +) + +st.write( + "### Over the past 5 years, did you receive a response to your FOI request(s) in a timely manner? `[MSfoi3]`" +) +MSfoi3_counts = df[filter]["MSfoi3"].value_counts() +st.plotly_chart( + render_pie_chart( + MSfoi3_counts, + values=MSfoi3_counts, + names=MSfoi3_counts.index, + color=MSfoi3_counts.index, + color_discrete_map={ + "Never": px.colors.qualitative.Prism[9], + "No, usually longer than 30 days": px.colors.qualitative.Prism[8], + "Yes, within 30 days": px.colors.qualitative.Prism[2], + "I don't know": px.colors.qualitative.Prism[10], + "I prefer not to say": px.colors.qualitative.Prism[10], + }, + ) +) + +st.write( + "### How helpful have FOI requests been for your work on intelligence-related issues? `[MSfoi4]`" +) +protectops2_counts = df[filter]["MSfoi4"].value_counts() + +st.plotly_chart( + render_pie_chart( + df[filter], + values=protectops2_counts, + names=protectops2_counts.index, + ) +) + +st.write( + "### Why haven’t you requested information under the national FOI law when you reported on intelligence-related issues over the past 5 years? `[MSfoi5]`" +) +MSfoi5_df = pd.DataFrame(columns=("option", "count", "country")) +# TODO Map proper labels +for label in [ + "not_aware", + "not_covered", + "too_expensive", + "too_time_consuming", + "afraid_of_data_destruction", + "afraid_of_discrimination", + "other", + "dont_know", + "prefer_not_to_say", +]: + MSfoi5_data = df[filter]["country"][df[f"MSfoi5[{label}]"] == 1].tolist() + for i in MSfoi5_data: + MSfoi5_df = MSfoi5_df.append( + {"option": label, "count": MSfoi5_data.count(i), "country": i}, + ignore_index=True, + ) +MSfoi5_df = MSfoi5_df.drop_duplicates() + +st.plotly_chart( + render_histogram( + MSfoi5_df, + x="option", + y="count", + nbins=None, + color="country", + color_discrete_map={ + "Germany": px.colors.qualitative.Prism[5], + "France": px.colors.qualitative.Prism[1], + "United Kingdom": px.colors.qualitative.Prism[7], + }, + labels={"count": "people who answered 'Yes'"}, + ) +) + +st.write("## Appreciation") + +st.write( + "### In the past 5 years, have stories on surveillance by intelligence agencies been nominated for a journalistic award in the country you primarily work in? `[MSapp1]`" +) +MSapp1_counts = df[filter]["MSapp1"].value_counts() +st.plotly_chart( + render_pie_chart( + MSapp1_counts, + values=MSapp1_counts, + names=MSapp1_counts.index, + color=MSapp1_counts.index, + color_discrete_map={ + "No": px.colors.qualitative.Prism[8], + "Yes": px.colors.qualitative.Prism[2], + "I don't know": px.colors.qualitative.Prism[10], + "I prefer not to say": px.colors.qualitative.Prism[10], + }, + ) +) + +st.write( + "### Are there specific awards in the country you primarily work in for reporting on intelligence-related topics? `[MSapp2]`" +) +MSapp2_counts = df[filter]["MSapp2"].value_counts() +st.plotly_chart( + render_pie_chart( + MSapp2_counts, + values=MSapp2_counts, + names=MSapp2_counts.index, + color=MSapp2_counts.index, + color_discrete_map={ + "No": px.colors.qualitative.Prism[8], + "Yes": px.colors.qualitative.Prism[2], + "I don't know": px.colors.qualitative.Prism[10], + "I prefer not to say": px.colors.qualitative.Prism[10], + }, + ) +) + +############################################################################### +# Appendix +############################################################################### + + +st.write("# Appendix") + +st.write("## Raw data") + +st.write(get_csv_download_link(df, "media"), unsafe_allow_html=True) +st.write(get_excel_download_link(df, "media"), unsafe_allow_html=True) + +table = st.checkbox("Show data as table") +if table: + st.dataframe(df[filter]) + +st.write("## Correlation Matrix (Phik `φK`)") + +st.write( + "Phik (φk) is a new and practical correlation coefficient that works consistently between categorical, ordinal and interval variables, captures non-linear dependency and reverts to the Pearson correlation coefficient in case of a bivariate normal input distribution. There is extensive documentation available [here](https://phik.readthedocs.io/en/latest/index.html)" +) + +show_corr = st.checkbox("Show correlation matrix") +if show_corr: + fig_corr = get_corr_matrix(df) + st.plotly_chart(fig_corr, use_container_width=True) -fig_corr = get_corr_matrix(df) -st.plotly_chart(fig_corr, use_container_width=True) +st.write("## Significance Matrix") -st.write("# Significance Matrix") st.markdown( body="When assessing correlations it is good practise to evaluate both the correlation and the significance of the correlation: a large correlation may be statistically insignificant, and vice versa a small correlation may be very significant. For instance, scipy.stats.pearsonr returns both the pearson correlation and the p-value. Similarly, the phik package offers functionality the calculate a significance matrix. Significance is defined as: " ) @@ -993,5 +1435,7 @@ def get_significance_matrix(df): body="$Z=\Phi^{-1}(1-p); \Phi(z)=\\frac{1}{\\sqrt{2\pi}}\int_{-\infty}^{z} e^{-t^{2}/2}\,dt$" ) -fig_sig = get_significance_matrix(df) -st.plotly_chart(fig_sig, use_container_width=True) +show_sig = st.checkbox("Show significance matrix") +if show_sig: + fig_sig = get_significance_matrix(df) + st.plotly_chart(fig_sig, use_container_width=True)