From 6a2a994f3e9c98d1a07e956a4659f4836d34224a Mon Sep 17 00:00:00 2001 From: linozen Date: Tue, 16 Nov 2021 15:13:16 +0100 Subject: [PATCH] clean and refactor merged.py --- README.md | 10 +- explorer/merged.py | 943 ++++------------------------------------ scripts/clean_merged.py | 776 +++++++++++++++++++++++++++++++++ 3 files changed, 869 insertions(+), 860 deletions(-) create mode 100755 scripts/clean_merged.py diff --git a/README.md b/README.md index dc841d1..a6865be 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,7 @@ -# IOI Survey Data +# GUARDINT Survey Data Wrangling, exploring and visualising the GUARDINT survey data -TODO: Fill out this long description. - ## Table of Contents - [Install](#install) @@ -28,12 +26,6 @@ TODO: Fill out this long description. [@linozen](https://github.com/linozen) -## Contributing - -PRs accepted. - -Small note: If editing the README, please conform to the [standard-readme](https://github.com/RichardLitt/standard-readme) specification. - ## License MIT © 2021 Stiftung Neue Verantworung e.V. 
diff --git a/explorer/merged.py b/explorer/merged.py index ece57d2..5830b22 100644 --- a/explorer/merged.py +++ b/explorer/merged.py @@ -5,11 +5,8 @@ import plotly.graph_objects as go import plotly.express as px from pathlib import Path -import time from lib.figures import ( - generate_pie_chart, - generate_boxplot, generate_histogram, generate_overlaid_histogram, generate_stacked_bar_chart, @@ -28,35 +25,67 @@ @st.cache -def render_pie_chart( - df, - values, - names, - color=None, - color_discrete_sequence=px.colors.qualitative.Prism, - hover_data=None, - custom_data=None, - color_discrete_map=None, - hover_name=None, - labels=None, +def gen_px_pie( + df, values, names, color_discrete_sequence=px.colors.qualitative.Prism, **kwargs ): - return generate_pie_chart( + fig = px.pie( df, - values, - names, - hover_name, - color, - hover_data, - custom_data, - color_discrete_sequence, - color_discrete_map, - labels, + values=values, + names=names, + color_discrete_sequence=color_discrete_sequence, + color=kwargs.get("color", None), + color_discrete_map=kwargs.get("color_discrete_map", None), + custom_data=kwargs.get("custom_data", None), + ) + # Update what is shown on the slices (on hover) + fig.update_traces( + textinfo="percent+value", + hovertemplate="""Answer %{label} +

given by %{value} respondents or %{percent} +
of all who answered the question +
given the current filter setup. + """, + ) + # Update layout + fig.update_layout( + autosize=False, + width=700, + height=400, + margin=dict(l=0, r=0, b=30, t=0), + legend={ + "font": {"size": kwargs.get("legend_font_size", 14)}, + "orientation": "h", + "bgcolor": "#efefef", + "x": 0.00, + "y": 1.10, + }, ) + return fig + + +px_pie_config = { + "displaylogo": True, + "modeBarButtonsToRemove": ["hoverClosestPie"], + "watermark": True, + "toImageButtonOptions": { + "width": 700, + "height": 450, + "scale": (210 / 25.4) / (700 / 300), + }, +} @st.cache -def render_boxplot(df, x, y, color=None, points="all", color_discrete_map=None): - return generate_boxplot(df, x, y, points, color, color_discrete_map) +def gen_px_box(df, x, y, points, color, color_discrete_map): + fig = px.box( + df, + x=x, + y=y, + points=points, + color=color, + color_discrete_map=color_discrete_map, + ) + return fig @st.cache @@ -98,803 +127,21 @@ def get_significance_matrix(df): return fig -################################################################################### -# General configuration -################################################################################### +# =========================================================================== +# Import data +# =========================================================================== + +df = pd.read_pickle("data/merged.pkl") +# =========================================================================== +# General configuration +# =========================================================================== st.set_page_config( page_title="IOI Survey Data Explorer", ) -################################################################################### -# Data wrangling -################################################################################### - - -@st.cache -def get_merged_cs_df(): - - # Merge CSV files into DataFrame - cs_csv_files = [ - "data/cs_uk_short.csv", - "data/cs_de_short.csv", - "data/cs_fr_short.csv", - ] - 
df_list = [] - for csv in cs_csv_files: - df_list.append(pd.read_csv(csv, sep=";")) - df = pd.concat(df_list) - - # Rename columns - df = df.rename( - columns={ - "startlanguage": "XXcountry", - "lastpage": "XXlastpage", - "CSfoi5[SQ01]": "CSfoi5[not_aware]", - "CSfoi5[SQ02]": "CSfoi5[not_covered]", - "CSfoi5[SQ03]": "CSfoi5[too_expensive]", - "CSfoi5[SQ04]": "CSfoi5[too_time_consuming]", - "CSfoi5[SQ05]": "CSfoi5[afraid_of_data_destruction]", - "CSfoi5[SQ06]": "CSfoi5[afraid_of_discrimination]", - "CSfoi5[SQ07]": "CSfoi5[other]", - "CSfoi5[SQ08]": "CSfoi5[dont_know]", - "CSfoi5[SQ09]": "CSfoi5[prefer_not_to_say]", - "CSprotectops1[SQ01]": "CSprotectops1[sectraining]", - "CSprotectops1[SQ02]": "CSprotectops1[e2e]", - "CSprotectops3[SQ01]": "CSprotectops3[encrypted_email]", - "CSprotectops3[SQ02]": "CSprotectops3[vpn]", - "CSprotectops3[SQ03]": "CSprotectops3[tor]", - "CSprotectops3[SQ04]": "CSprotectops3[e2e_chat]", - "CSprotectops3[SQ05]": "CSprotectops3[encrypted_hardware]", - "CSprotectops3[SQ06]": "CSprotectops3[2fa]", - "CSprotectops3[SQ07]": "CSprotectops3[other]", - "CSprotectleg3[SQ01]": "CSprotectleg3[free_counsel]", - "CSprotectleg3[SQ02]": "CSprotectleg3[cost_insurance]", - "CSprotectleg3[SQ03]": "CSprotectleg3[other]", - "CSconstraintinter4[SQ01]": "CSconstraintinter4[police_search]", - "CSconstraintinter4[SQ02]": "CSconstraintinter4[seizure]", - "CSconstraintinter4[SQ03]": "CSconstraintinter4[extortion]", - "CSconstraintinter4[SQ04]": "CSconstraintinter4[violent_threat]", - "CSconstraintinter4[SQ05]": "CSconstraintinter4[inspection_during_travel]", - "CSconstraintinter4[SQ06]": "CSconstraintinter4[detention]", - "CSconstraintinter4[SQ07]": "CSconstraintinter4[surveillance_signalling]", - "CSconstraintinter4[SQ08]": "CSconstraintinter4[online_harassment]", - "CSconstraintinter4[SQ09]": "CSconstraintinter4[entry_on_deny_lists]", - "CSconstraintinter4[SQ10]": "CSconstraintinter4[exclusion_from_events]", - "CSconstraintinter4[SQ11]": 
"CSconstraintinter4[public_defamation]", - "CSconstraintinter5[SQ01]": "CSconstraintinter5[unsolicited_information]", - "CSconstraintinter5[SQ02]": "CSconstraintinter5[invitations]", - "CSconstraintinter5[SQ03]": "CSconstraintinter5[other]", - "CSconstraintinter5ot": "CSconstraintinter5other", - "CSconstraintinter6[SQ01]": "CSconstraintinter6[gender]", - "CSconstraintinter6[SQ02]": "CSconstraintinter6[ethnicity]", - "CSconstraintinter6[SQ03]": "CSconstraintinter6[political]", - "CSconstraintinter6[SQ04]": "CSconstraintinter6[sexual]", - "CSconstraintinter6[SQ05]": "CSconstraintinter6[religious]", - "CSconstraintinter6[SQ06]": "CSconstraintinter6[other]", - "CSconstraintinter6ot": "CSconstraintinter6other", - "CSattitude3[SQ01]": "CSattitude3[rule_of_law]", - "CSattitude3[SQ02]": "CSattitude3[civil_liberties]", - "CSattitude3[SQ03]": "CSattitude3[effectiveness_of_intel]", - "CSattitude3[SQ04]": "CSattitude3[legitimacy_of_intel]", - "CSattitude3[SQ05]": "CSattitude3[trust_in_intel]", - "CSattitude3[SQ06]": "CSattitude3[critique_of_intel]", - "CSattitude3[SQ08]": "CSattitude3[prefer_not_to_say]", - } - ) - df = df.replace(to_replace=r"en", value="United Kingdom") - df = df.replace(to_replace=r"de", value="Germany") - df = df.replace(to_replace=r"fr", value="France") - - # Drop (very) incomplete surveys - df = df[df["XXlastpage"] > 2] - - # Drop all but the columns needed for the analysis - df = df[ - [ - "XXcountry", - "XXlastpage", - "CShr1", - "CShr2", - "CSgender", - "CSexpertise1", - "CSexpertise2", - "CSexpertise3", - "CSexpertise4", - "CSfinance1", - "CSfoi1", - "CSfoi2", - "CSfoi3", - "CSfoi4", - "CSfoi5[not_aware]", - "CSfoi5[not_covered]", - "CSfoi5[too_expensive]", - "CSfoi5[too_time_consuming]", - "CSfoi5[afraid_of_data_destruction]", - "CSfoi5[afraid_of_discrimination]", - "CSfoi5[other]", - "CSfoi5[dont_know]", - "CSfoi5[prefer_not_to_say]", - "CSfoi5other", - "CSprotectops1[sectraining]", - "CSprotectops1[e2e]", - "CSprotectops2", - 
"CSprotectops3[encrypted_email]", - "CSprotectops3[vpn]", - "CSprotectops3[tor]", - "CSprotectops3[e2e_chat]", - "CSprotectops3[encrypted_hardware]", - "CSprotectops3[2fa]", - "CSprotectops3[other]", - "CSprotectops3other", - "CSprotectops4", - "CSprotectleg1", - "CSprotectleg2", - "CSprotectleg2no", - "CSprotectleg3[free_counsel]", - "CSprotectleg3[cost_insurance]", - "CSprotectleg3[other]", - "CSprotectleg3other", - "CSconstraintinter1", - "CSconstraintinter2", - "CSconstraintinter3", - "CSconstraintinter4[police_search]", - "CSconstraintinter4[seizure]", - "CSconstraintinter4[extortion]", - "CSconstraintinter4[violent_threat]", - "CSconstraintinter4[inspection_during_travel]", - "CSconstraintinter4[detention]", - "CSconstraintinter4[surveillance_signalling]", - "CSconstraintinter4[online_harassment]", - "CSconstraintinter4[entry_on_deny_lists]", - "CSconstraintinter4[exclusion_from_events]", - "CSconstraintinter4[public_defamation]", - "CSconstraintinter5[unsolicited_information]", - "CSconstraintinter5[invitations]", - "CSconstraintinter5[other]", - "CSconstraintinter5other", - "CSconstraintinter6[gender]", - "CSconstraintinter6[ethnicity]", - "CSconstraintinter6[political]", - "CSconstraintinter6[sexual]", - "CSconstraintinter6[religious]", - "CSconstraintinter6[other]", - "CSconstraintinter6other", - "CSattitude1", - "CSattitude2", - "CSattitude3[rule_of_law]", - "CSattitude3[civil_liberties]", - "CSattitude3[effectiveness_of_intel]", - "CSattitude3[legitimacy_of_intel]", - "CSattitude3[trust_in_intel]", - "CSattitude3[critique_of_intel]", - "CSattitude3[prefer_not_to_say]", - "CSattitude4[1]", - "CSattitude4[2]", - "CSattitude4[3]", - "CSattitude4[4]", - "CSattitude4[5]", - "CSattitude4[6]", - "CSattitude5[1]", - "CSattitude5[2]", - "CSattitude5[3]", - "CSattitude5[4]", - "CSattitude5[5]", - "CSattitude5[6]", - "CSattitude6[1]", - "CSattitude6[2]", - "CSattitude6[3]", - "CSattitude6[4]", - "CSattitude6[5]", - "CSattitude6[6]", - ] - ] - - # Make column names 
compatible - df.columns = df.columns.str[2:] - - # Set surveytype - df["surveytype"] = "Civil Society Scrutiny" - return df - - -@st.cache -def get_merged_ms_df(): - # Merge CSV files into DataFrame - ms_csv_files = [ - "data/ms_uk_short.csv", - "data/ms_de_short.csv", - "data/ms_fr_short.csv", - ] - df_list = [] - for csv in ms_csv_files: - df_list.append(pd.read_csv(csv, sep=";")) - df = pd.concat(df_list) - - # Rename columns - df = df.rename( - columns={ - "startlanguage": "XXcountry", - "lastpage": "XXlastpage", - "MFfoi2": "MSfoi2", - "MSfoi5[SQ01]": "MSfoi5[not_aware]", - "MSfoi5[SQ02]": "MSfoi5[not_covered]", - "MSfoi5[SQ03]": "MSfoi5[too_expensive]", - "MSfoi5[SQ04]": "MSfoi5[too_time_consuming]", - "MSfoi5[SQ05]": "MSfoi5[afraid_of_data_destruction]", - "MSfoi5[SQ06]": "MSfoi5[afraid_of_discrimination]", - "MSfoi5[SQ07]": "MSfoi5[other]", - "MSfoi5[SQ08]": "MSfoi5[dont_know]", - "MSfoi5[SQ09]": "MSfoi5[prefer_not_to_say]", - "MSfoi5specify": "MSfoi5other", - "MScontstraintinter1": "MSconstraintinter1", - "MSprotectleg2A": "MSprotectleg2", - "MSprotectleg2Ano": "MSprotectleg2no", - "MSprotectops1[SQ01]": "MSprotectops1[sectraining]", - "MSprotectops1[SQ03]": "MSprotectops1[e2e]", - "MSprotectops3[SQ01]": "MSprotectops3[encrypted_email]", - "MSprotectops3[SQ02]": "MSprotectops3[vpn]", - "MSprotectops3[SQ03]": "MSprotectops3[tor]", - "MSprotectops3[SQ04]": "MSprotectops3[e2e_chat]", - "MSprotectops3[SQ05]": "MSprotectops3[encrypted_hardware]", - "MSprotectops3[SQ06]": "MSprotectops3[2fa]", - "MSprotectops3[SQ08]": "MSprotectops3[other]", - "MSprotectleg3[SQ01]": "MSprotectleg3[free_counsel]", - "MSprotectleg3[SQ02]": "MSprotectleg3[cost_insurance]", - "MSprotectleg3[SQ03]": "MSprotectleg3[other]", - "MSconstraintinter4[SQ01]": "MSconstraintinter4[police_search]", - "MSconstraintinter4[SQ02]": "MSconstraintinter4[seizure]", - "MSconstraintinter4[SQ03]": "MSconstraintinter4[extortion]", - "MSconstraintinter4[SQ04]": "MSconstraintinter4[violent_threat]", - 
"MSconstraintinter4[SQ05]": "MSconstraintinter4[inspection_during_travel]", - "MSconstraintinter4[SQ06]": "MSconstraintinter4[detention]", - "MSconstraintinter4[SQ07]": "MSconstraintinter4[surveillance_signalling]", - "MSconstraintinter4[SQ08]": "MSconstraintinter4[online_harassment]", - "MSconstraintinter4[SQ09]": "MSconstraintinter4[entry_on_deny_lists]", - "MSconstraintinter4[SQ10]": "MSconstraintinter4[exclusion_from_events]", - "MSconstraintinter4[SQ11]": "MSconstraintinter4[public_defamation]", - "MSconstraintinter5[SQ01]": "MSconstraintinter5[unsolicited_information]", - "MSconstraintinter5[SQ02]": "MSconstraintinter5[invitations]", - "MSconstraintinter5[SQ03]": "MSconstraintinter5[other]", - "MSconstraintinter5ot": "MSconstraintinter5other", - "MSconstraintinter6[SQ01]": "MSconstraintinter6[gender]", - "MSconstraintinter6[SQ02]": "MSconstraintinter6[ethnicity]", - "MSconstraintinter6[SQ03]": "MSconstraintinter6[political]", - "MSconstraintinter6[SQ04]": "MSconstraintinter6[sexual]", - "MSconstraintinter6[SQ05]": "MSconstraintinter6[religious]", - "MSconstraintinter6[SQ06]": "MSconstraintinter6[other]", - "MSconstraintinter6ot": "MSconstraintinter6other", - "MSattitude3[SQ01]": "MSattitude3[rule_of_law]", - "MSattitude3[SQ02]": "MSattitude3[civil_liberties]", - "MSattitude3[SQ03]": "MSattitude3[effectiveness_of_intel]", - "MSattitude3[SQ04]": "MSattitude3[legitimacy_of_intel]", - "MSattitude3[SQ05]": "MSattitude3[trust_in_intel]", - "MSattitude3[SQ06]": "MSattitude3[critique_of_intel]", - "MSattitude3[SQ07]": "MSattitude3[prefer_not_to_say]", - } - ) - df = df.replace(to_replace=r"en", value="United Kingdom") - df = df.replace(to_replace=r"de", value="Germany") - df = df.replace(to_replace=r"fr", value="France") - - # Drop (very) incomplete surveys - df = df[df["XXlastpage"] > 2] - - # Drop all but the columns needed for the analysis - df = df[ - [ - "XXcountry", - "XXlastpage", - "MShr1", - "MShr2", - "MSgender", - "MSexpertise1", - "MSexpertise2", - 
"MSexpertise3", - "MSexpertise4", - "MSfinance1", - "MSfoi1", - "MSfoi2", - "MSfoi3", - "MSfoi4", - "MSfoi5[not_aware]", - "MSfoi5[not_covered]", - "MSfoi5[too_expensive]", - "MSfoi5[too_time_consuming]", - "MSfoi5[afraid_of_data_destruction]", - "MSfoi5[afraid_of_discrimination]", - "MSfoi5[other]", - "MSfoi5[dont_know]", - "MSfoi5[prefer_not_to_say]", - "MSfoi5other", - "MSprotectops1[sectraining]", - "MSprotectops1[e2e]", - "MSprotectops2", - "MSprotectops3[encrypted_email]", - "MSprotectops3[vpn]", - "MSprotectops3[tor]", - "MSprotectops3[e2e_chat]", - "MSprotectops3[encrypted_hardware]", - "MSprotectops3[2fa]", - "MSprotectops3[other]", - "MSprotectops3other", - "MSprotectops4", - "MSprotectleg1", - "MSprotectleg2", - "MSprotectleg2no", - "MSprotectleg3[free_counsel]", - "MSprotectleg3[cost_insurance]", - "MSprotectleg3[other]", - "MSprotectleg3other", - "MSconstraintinter1", - "MSconstraintinter2", - "MSconstraintinter3", - "MSconstraintinter4[police_search]", - "MSconstraintinter4[seizure]", - "MSconstraintinter4[extortion]", - "MSconstraintinter4[violent_threat]", - "MSconstraintinter4[inspection_during_travel]", - "MSconstraintinter4[detention]", - "MSconstraintinter4[surveillance_signalling]", - "MSconstraintinter4[online_harassment]", - "MSconstraintinter4[entry_on_deny_lists]", - "MSconstraintinter4[exclusion_from_events]", - "MSconstraintinter4[public_defamation]", - "MSconstraintinter5[unsolicited_information]", - "MSconstraintinter5[invitations]", - "MSconstraintinter5[other]", - "MSconstraintinter5other", - "MSconstraintinter6[gender]", - "MSconstraintinter6[ethnicity]", - "MSconstraintinter6[political]", - "MSconstraintinter6[sexual]", - "MSconstraintinter6[religious]", - "MSconstraintinter6[other]", - "MSconstraintinter6other", - "MSattitude1", - "MSattitude2", - "MSattitude3[rule_of_law]", - "MSattitude3[civil_liberties]", - "MSattitude3[effectiveness_of_intel]", - "MSattitude3[legitimacy_of_intel]", - "MSattitude3[trust_in_intel]", - 
"MSattitude3[critique_of_intel]", - "MSattitude3[prefer_not_to_say]", - "MSattitude4[1]", - "MSattitude4[2]", - "MSattitude4[3]", - "MSattitude4[4]", - "MSattitude4[5]", - "MSattitude4[6]", - "MSattitude5[1]", - "MSattitude5[2]", - "MSattitude5[3]", - "MSattitude5[4]", - "MSattitude5[5]", - "MSattitude5[6]", - "MSattitude6[1]", - "MSattitude6[2]", - "MSattitude6[3]", - "MSattitude6[4]", - "MSattitude6[5]", - "MSattitude6[6]", - ] - ] - - # Make column names compatible - df.columns = df.columns.str[2:] - - # Set surveytype - df["surveytype"] = "Media Scrutiny" - return df - - -############################################################################### -# Define base DataFrame (Merge CS with MS) -############################################################################### - - -df_cs = get_merged_cs_df() -df_ms = get_merged_ms_df() -df = pd.concat([df_cs, df_ms], ignore_index=True) - - -############################################################################### -# Make answers human-readable -############################################################################### - - -# Helper variables needed when answers are coded differently in the respective -# survey types or languages -is_civsoc = df.surveytype == "Civil Society Scrutiny" -is_media = df.surveytype == "Media Scrutiny" -is_de = df.country == "Germany" -is_uk = df.country == "United Kingdom" -is_fr = df.country == "France" - -df.loc[is_civsoc, "hr1"] = df["hr1"].replace( - { - "AO01": "Full-time", - "AO02": "Part-time (>50%)", - "AO03": "Part-time (<50%)", - "AO04": "Freelance", - "AO05": "Unpaid", - "AO06": "Other", - "AO07": "I don't know", - "AO08": "I prefer not to say", - } -) -df.loc[is_media, "hr1"] = df["hr1"].replace( - { - "AO01": "Full-time", - "AO02": "Part-time (>50%)", - "AO03": "Part-time (<50%)", - "AO04": "Freelance", - "AO05": "Unpaid", - "AO08": "Other", - "AO06": "I don't know", - "AO07": "I prefer not to say", - } -) - -df["gender"] = df["gender"].fillna("Not specified") 
-df["gender"] = df["gender"].replace( - { - "AO01": "Female", - "AO02": "Non-binary", - "AO03": "Male", - "AO04": "I prefer not to say", - "AO05": "Other", - } -) - -df["expertise2"] = df["expertise2"].replace( - { - "AO01": "Expert knowledge", - "AO02": "Advanced knowledge", - "AO03": "Some knowledge", - "AO04": "Basic knowledge", - "AO05": "No knowledge", - "AO06": "I don't know", - "AO07": "I prefer not to say", - } -) - -df["expertise3"] = df["expertise3"].replace( - { - "AO01": "Expert knowledge", - "AO02": "Advanced knowledge", - "AO03": "Some knowledge", - "AO04": "Basic knowledge", - "AO05": "No knowledge", - "AO06": "I don't know", - "AO07": "I prefer not to say", - } -) - -df["expertise4"] = df["expertise4"].replace( - { - "AO01": "Expert knowledge", - "AO02": "Advanced knowledge", - "AO03": "Some knowledge", - "AO04": "Basic knowledge", - "AO05": "No knowledge", - "AO06": "I don't know", - "AO07": "I prefer not to say", - } -) - -df["finance1"] = df["finance1"].replace( - { - "AO01": "A great deal of funding", - "AO02": "Sufficient funding", - "AO03": "Some funding", - "AO04": "Little funding", - "AO05": "No funding", - "AO06": "I don't know", - "AO07": "I prefer not to say", - } -) - -df["foi1"] = df["foi1"].replace( - { - "AO01": "Yes", - "AO02": "No", - "AO03": "I don't know", - "AO04": "I prefer not to say", - } -) - -df["foi3"] = df["foi3"].replace( - { - "AO01": "Yes, within 30 days", - "AO02": "No, usually longer than 30 days", - "AO03": "Never", - "AO04": "I don't know", - "AO05": "I prefere not to say", - } -) - -df.loc[is_civsoc, "foi4"] = df["foi4"].replace( - { - "AO01": "Very helpful", - "AO03": "Helpful in parts", - "AO05": "Not helpful at all", - "AO06": "I don't know", - "AO07": "I prefer not to say", - } -) - -df.loc[is_media, "foi4"] = df["foi4"].replace( - { - "AO01": "Very helpful", - "AO02": "Helpful in parts", - "AO03": "Not helpful at all", - "AO06": "I don't know", - "AO07": "I prefer not to say", - } -) - 
-df["protectops1[sectraining]"] = df["protectops1[sectraining]"].replace( - { - "AO01": "Yes", - "AO02": "No", - "AO03": "I don't know", - "AO04": "I prefer not to say", - } -) - -df["protectops1[e2e]"] = df["protectops1[e2e]"].replace( - { - "AO01": "Yes", - "AO02": "No", - "AO03": "I don't know", - "AO04": "I prefer not to say", - } -) - -df["protectops2"] = df["protectops2"].replace( - { - "AO01": "Yes", - "AO02": "No", - "AO03": "I don't know", - "AO04": "I prefer not to say", - } -) - -protectops3_options = [ - "encrypted_email", - "vpn", - "tor", - "e2e_chat", - "encrypted_hardware", - "2fa", - "other", -] -for label in protectops3_options: - df[f"protectops3[{label}]"] = df[f"protectops3[{label}]"].replace( - { - "AO01": "Very important", - "AO02": "Important", - "AO03": "Somewhat important", - "AO04": "Slightly important", - "AO05": "Not important at all", - "AO06": "I don't know", - "AO07": "I prefer not to say", - } - ) -# this was hard to spot. Only the CS survey for DE was coded as below -for label in protectops3_options: - df.loc[(is_civsoc) & (is_de), f"protectops3[{label}]"] = df[ - f"protectops3[{label}]" - ].replace( - { - "AO01": "Very important", - "AO02": "Important", - "AO03": "Somewhat important", - "AO04": "Slightly important", - "AO05": "Not important at all", - # notice the AO09 instead of AO06 as above - "AO09": "I don't know", - "AO11": "I prefer not to say", - } - ) - -df["protectops4"] = df["protectops4"].replace( - { - "AO01": "I have full confidence that the right tools
will protect my communication from surveillance", - "AO02": "Technological tools help to protect my identity
to some extent, but an attacker with sufficient power
may eventually be able to bypass my technological
safeguards", - "AO03": "Under the current conditions of communications
surveillance, technological solutions cannot offer
sufficient protection for the data I handle", - "AO04": "I have no confidence in the protection offered by
technological tools", - "AO05": "I try to avoid technology-based communication whenever
possible when I work on intelligence-related issues", - "AO06": "I don't know", - "AO07": "I prefer not to say", - } -) - -df["protectleg1"] = df["protectleg1"].replace( - { - "AO01": "Always", - "AO02": "Often", - "AO03": "Sometimes", - "AO04": "Rarely", - "AO05": "Never", - "AO06": "I don't know", - "AO07": "I prefer not to say", - } -) - -df["protectleg2"] = df["protectleg2"].replace( - { - "AO01": "Yes", - "AO02": "No", - "AO03": "I don't know", - "AO04": "I prefer not to say", - } -) - -for label in ["free_counsel", "cost_insurance", "other"]: - df[f"protectleg3[{label}]"] = df[f"protectleg3[{label}]"].replace( - { - "AO01": "Yes", - "AO02": "No", - "AO03": "I don't know", - "AO04": "I prefer not to say", - } - ) - -df["constraintinter1"] = df["constraintinter1"].replace( - { - "AO01": "Yes, I have evidence", - "AO02": "Yes, I suspect", - "AO03": "No", - "AO04": "I don't know", - "AO05": "I prefer not to say", - } -) - -df["constraintinter2"] = df["constraintinter2"].replace( - { - "AO01": "Yes", - "AO02": "No", - "AO03": "I don't know", - "AO04": "I prefer not to say", - } -) - -df["constraintinter3"] = df["constraintinter3"].replace( - { - "AO01": "I was threatened with prosecution", - "AO02": "I was prosecuted but acquitted", - "AO03": "I was prosecuted and convicted", - "AO04": "I don't know", - "AO05": "I prefer not to say", - } -) - -constraintinter4_options = [ - "police_search", - "seizure", - "extortion", - "violent_threat", - "inspection_during_travel", - "detention", - "surveillance_signalling", - "online_harassment", - "entry_on_deny_lists", - "exclusion_from_events", - "public_defamation", -] - -for label in constraintinter4_options: - df[f"constraintinter4[{label}]"] = df[f"constraintinter4[{label}]"].replace( - { - "AO01": "Yes", - "AO02": "No", - "AO03": "I don't know", - "AO04": "I prefer not to say", - } - ) - -for label in ["unsolicited_information", "invitations", "other"]: - df[f"constraintinter5[{label}]"] = 
df[f"constraintinter5[{label}]"].replace( - { - "AO01": "Yes", - "AO02": "No", - "AO03": "I don't know", - "AO04": "I prefer not to say", - } - ) - -for label in ["gender", "ethnicity", "political", "sexual", "religious", "other"]: - df[f"constraintinter6[{label}]"] = df[f"constraintinter6[{label}]"].replace( - { - "AO01": "Yes", - "AO02": "No", - "AO03": "I don't know", - "AO04": "I prefer not to say", - } - ) - -df["attitude1"] = df["attitude1"].replace( - { - "AO01": "Intelligence agencies are incompatible with democratic
values and should be abolished", - "AO02": "Intelligence agencies contradict democratic principles,
and their powers should be kept at a bare minimum", - "AO03": "Intelligence agencies are necessary and legitimate institutions
of democratic states, even though they may sometimes overstep
their legal mandates", - "AO04": "Intelligence agencies are a vital component of national
security and should be shielded from excessive bureaucratic
restrictions", - "AO05": "I prefer not to say", - } -) - -df["attitude2"] = df["attitude2"].replace( - { - "AO01": "Intelligence oversight generally succeeds in uncovering
past misconduct and preventing future misconduct", - "AO02": "Intelligence oversight is mostly effective, however its
institutional design needs reform for oversight practitioners
to reliably uncover past misconduct and prevent future
misconduct", - "AO03": "Intelligence oversight lacks efficacy, hence a fundamental
reorganization of oversight capacity is needed for oversight
practitioners to reliably uncover past misconduct and
prevent future misconduct", - "AO04": "Effective intelligence oversight is a hopeless endeavour
and even a systematic reorganization is unlikely to ensure
misconduct is uncovered and prevented.", - "AO05": "I prefer not to say", - } -) - -for i in range(4, 7): - for j in range(1, 7): - df[f"attitude{i}[{j}]"] = df[f"attitude{i}[{j}]"].replace( - { - "AO01": "Parliamentary oversight bodies", - "AO02": "Judicial oversight bodies", - "AO03": "Independent expert bodies", - "AO04": "Data protection authorities", - "AO05": "Audit courts", - "AO06": "CSOs | The media", - } - ) - # Here, CS FR is coded differently - df.loc[(is_fr) & (is_civsoc), f"attitude{i}[{j}]"] = df[ - f"attitude{i}[{j}]" - ].replace( - { - "AO01": "Parliamentary oversight bodies", - "AO02": "Judicial oversight bodies", - "AO03": "Independent expert bodies", - "AO04": "Data protection authorities", - "AO07": "Audit courts", - "AO06": "CSOs | The media", - } - ) - -############################################################################### -# Make answers analysable (change data types etc.) -############################################################################### - - -df["hr2"] = df["hr2"].replace("?", np.nan) -df["hr2"] = df["hr2"].replace("0,5", 0.5) -df["hr2"] = pd.to_numeric(df["hr2"], errors="coerce") - -df["expertise1"] = df["expertise1"].replace("?", np.nan) -df["expertise1"] = df["expertise1"].replace("<1", 0.5) -df["expertise1"] = pd.to_numeric(df["expertise1"], errors="coerce") - -df["foi2"] = df["foi2"].replace( - {"20+": 20.0, " ca 10": 10.0, "several": 3.0, "15+": 15.0} -) -df["foi2"] = pd.to_numeric(df["foi2"], errors="coerce") - -# Here, I change the datatype to boolean for all the multiple choice answers -for col in df: - if col.startswith("foi5[") or col.startswith("attitude3"): - df[col] = df[col].replace(np.nan, False) - df[col] = df[col].replace("Y", True) - df[col] = df[col].astype("bool") - - -############################################################################### -# Sidebar | Filter logic -############################################################################### - - def callback(): 
st.experimental_set_query_params(section=st.session_state.section) @@ -951,17 +198,6 @@ def callback(): else: filter = filter & (df[column_name] == selectbox) - -############################################################################### -# Save a useful snapshot of the merged data (comment out for production) -############################################################################### - - -# df.to_pickle("./data/merged.pkl") -# df.to_excel("./data/merged.xlsx") -# df.to_csv("./data/merged.csv") - - ############################################################################### # Custom JS/CSS ############################################################################### @@ -1025,10 +261,14 @@ def callback(): "†For the calculation of the mean, only valid numerical answers were counted. This is why the number might differ from the number one gets when simply dividing e.g. the cumulative years spent working on SBIA by the overall number of respondents (including those who haven't specified their experience in years)." 
) - st.write("### Country `[country]`") country_counts = df[filter]["country"].value_counts() + st.write("### Country") + country_total = country_counts.sum() + st.write( + f"_**{country_total}** respondents in total answered the question with the current filter._" + ) st.plotly_chart( - render_pie_chart( + gen_px_pie( df[filter], values=country_counts, names=country_counts.index, @@ -1040,12 +280,13 @@ def callback(): }, ), use_container_width=True, + config=px_pie_config, ) st.write("### Surveytype `[surveytype]`") surveytype_counts = df[filter]["surveytype"].value_counts() st.plotly_chart( - render_pie_chart( + gen_px_pie( df[filter], values=surveytype_counts, names=surveytype_counts.index, @@ -1056,7 +297,7 @@ def callback(): st.write("### Gender `[gender]`") gender_counts = df[filter]["gender"].value_counts() st.plotly_chart( - render_pie_chart( + gen_px_pie( df[filter], values=gender_counts, names=gender_counts.index, @@ -1079,7 +320,7 @@ def callback(): st.write("### What is your employment status `[hr1]`") hr1_counts = df[filter]["hr1"].value_counts() st.plotly_chart( - render_pie_chart( + gen_px_pie( hr1_counts, values=hr1_counts, names=hr1_counts.index, @@ -1116,7 +357,7 @@ def callback(): ) st.plotly_chart( - render_boxplot( + gen_px_box( df=df[filter], points="all", x="country", @@ -1137,7 +378,7 @@ def callback(): ) hr2_more_than_five_counts = df[filter]["hr2_more_than_five"].value_counts() st.plotly_chart( - render_pie_chart( + gen_px_pie( hr2_more_than_five_counts, values=hr2_more_than_five_counts, names=hr2_more_than_five_counts.index, @@ -1169,7 +410,7 @@ def callback(): ) st.plotly_chart( - render_boxplot( + gen_px_box( df=df[filter], points="all", x="country", @@ -1189,7 +430,7 @@ def callback(): ) expertise2_counts = df[filter]["expertise2"].value_counts() st.plotly_chart( - render_pie_chart( + gen_px_pie( df[filter], values=expertise2_counts, names=expertise2_counts.index, @@ -1213,7 +454,7 @@ def callback(): ) expertise3_counts = 
df[filter]["expertise3"].value_counts() st.plotly_chart( - render_pie_chart( + gen_px_pie( df[filter], values=expertise3_counts, names=expertise3_counts.index, @@ -1237,7 +478,7 @@ def callback(): ) expertise4_counts = df[filter]["expertise4"].value_counts() st.plotly_chart( - render_pie_chart( + gen_px_pie( df[filter], values=expertise4_counts, names=expertise4_counts.index, @@ -1263,7 +504,7 @@ def callback(): ) finance1_counts = df[filter]["finance1"].value_counts() st.plotly_chart( - render_pie_chart( + gen_px_pie( df[filter], values=finance1_counts, names=finance1_counts.index, @@ -1288,7 +529,7 @@ def callback(): ) foi1_counts = df[filter]["foi1"].value_counts() st.plotly_chart( - render_pie_chart( + gen_px_pie( foi1_counts, values=foi1_counts, names=foi1_counts.index, @@ -1322,7 +563,7 @@ def callback(): ) st.plotly_chart( - render_boxplot( + gen_px_box( df=df[filter], points="all", x="country", @@ -1342,7 +583,7 @@ def callback(): ) foi3_counts = df[filter]["foi3"].value_counts() st.plotly_chart( - render_pie_chart( + gen_px_pie( foi3_counts, values=foi3_counts, names=foi3_counts.index, @@ -1364,7 +605,7 @@ def callback(): foi4_counts = df[filter]["foi4"].value_counts() st.plotly_chart( - render_pie_chart( + gen_px_pie( df[filter], values=foi4_counts, names=foi4_counts.index, @@ -1493,7 +734,7 @@ def callback(): ) protectops2_counts = df[filter]["protectops2"].value_counts() st.plotly_chart( - render_pie_chart( + gen_px_pie( df[filter], values=protectops2_counts, names=protectops2_counts.index, @@ -1601,7 +842,7 @@ def callback(): ) protectops4_counts = df[filter]["protectops4"].value_counts() st.plotly_chart( - render_pie_chart( + gen_px_pie( protectops4_counts, values=protectops4_counts, names=protectops4_counts.index, @@ -1638,7 +879,7 @@ def callback(): protectleg1_counts = df[filter]["protectleg1"].value_counts() st.plotly_chart( - render_pie_chart( + gen_px_pie( df[filter], values=protectleg1_counts, names=protectleg1_counts.index, @@ -1662,7 +903,7 
#!/usr/bin/env python3
"""Merge and clean the GUARDINT LimeSurvey exports.

The script reads the civil-society ("CS") and media ("MS") survey exports for
the UK, Germany and France from ``data/limesurvey/``, harmonises column names
and answer codes across the six files, converts numeric and multiple-choice
answers to analysable dtypes and writes the result to
``data/merged.pkl``, ``data/merged.xlsx`` and ``data/merged.csv``.

All work happens in :func:`main`, which only runs when the file is executed
as a script, so the helpers can be imported without touching the file system.
"""
import numpy as np
import pandas as pd

CIVIL_SOCIETY = "Civil Society Scrutiny"
MEDIA = "Media Scrutiny"

CS_CSV_FILES = [
    "data/limesurvey/cs_uk_short.csv",
    "data/limesurvey/cs_de_short.csv",
    "data/limesurvey/cs_fr_short.csv",
]
MS_CSV_FILES = [
    "data/limesurvey/ms_uk_short.csv",
    "data/limesurvey/ms_de_short.csv",
    "data/limesurvey/ms_fr_short.csv",
]

# The survey language doubles as the respondent's country.
COUNTRY_BY_LANGUAGE = {"en": "United Kingdom", "de": "Germany", "fr": "France"}

# Separator used to wrap the long answer labels.
# NOTE(review): the separator was lost where this code was extracted from;
# "<br>" (Plotly's line-break markup) is assumed here -- confirm against the
# labels the explorer expects.
LINE_BREAK = "<br>"

# ===========================================================================
# Column layout
# ===========================================================================

# Readable names of the sub-questions of every multi-item question, in the
# order in which they appear in the merged table.
SUBQUESTION_OPTIONS = {
    "foi5": [
        "not_aware",
        "not_covered",
        "too_expensive",
        "too_time_consuming",
        "afraid_of_data_destruction",
        "afraid_of_discrimination",
        "other",
        "dont_know",
        "prefer_not_to_say",
    ],
    "protectops1": ["sectraining", "e2e"],
    "protectops3": [
        "encrypted_email",
        "vpn",
        "tor",
        "e2e_chat",
        "encrypted_hardware",
        "2fa",
        "other",
    ],
    "protectleg3": ["free_counsel", "cost_insurance", "other"],
    "constraintinter4": [
        "police_search",
        "seizure",
        "extortion",
        "violent_threat",
        "inspection_during_travel",
        "detention",
        "surveillance_signalling",
        "online_harassment",
        "entry_on_deny_lists",
        "exclusion_from_events",
        "public_defamation",
    ],
    "constraintinter5": ["unsolicited_information", "invitations", "other"],
    "constraintinter6": [
        "gender",
        "ethnicity",
        "political",
        "sexual",
        "religious",
        "other",
    ],
    "attitude3": [
        "rule_of_law",
        "civil_liberties",
        "effectiveness_of_intel",
        "legitimacy_of_intel",
        "trust_in_intel",
        "critique_of_intel",
        "prefer_not_to_say",
    ],
}

# Sub-questions are exported as "<question>[SQnn]".  Unless listed here, the
# nn simply count up from 01 in the order of SUBQUESTION_OPTIONS.
SUBQUESTION_NUMBERS = {
    "CS": {"attitude3": (1, 2, 3, 4, 5, 6, 8)},
    "MS": {
        "protectops1": (1, 3),
        "protectops3": (1, 2, 3, 4, 5, 6, 8),
    },
}

# Export columns whose names deviate between the surveys (including the
# misspelt ones in the media export).
EXTRA_RENAMES = {
    "CS": {},
    "MS": {
        "MFfoi2": "MSfoi2",
        "MSfoi5specify": "MSfoi5other",
        "MScontstraintinter1": "MSconstraintinter1",
        "MSprotectleg2A": "MSprotectleg2",
        "MSprotectleg2Ano": "MSprotectleg2no",
    },
}


def _options(question):
    """Return the ``question[option]`` column names of a multi-item question."""
    return [f"{question}[{option}]" for option in SUBQUESTION_OPTIONS[question]]


# Columns kept for the analysis (without the "CS"/"MS" prefix), in output order.
ANSWER_COLUMNS = [
    "hr1",
    "hr2",
    "gender",
    "expertise1",
    "expertise2",
    "expertise3",
    "expertise4",
    "finance1",
    "foi1",
    "foi2",
    "foi3",
    "foi4",
    *_options("foi5"),
    "foi5other",
    *_options("protectops1"),
    "protectops2",
    *_options("protectops3"),
    "protectops3other",
    "protectops4",
    "protectleg1",
    "protectleg2",
    "protectleg2no",
    *_options("protectleg3"),
    "protectleg3other",
    "constraintinter1",
    "constraintinter2",
    "constraintinter3",
    *_options("constraintinter4"),
    *_options("constraintinter5"),
    "constraintinter5other",
    *_options("constraintinter6"),
    "constraintinter6other",
    "attitude1",
    "attitude2",
    *_options("attitude3"),
    *(f"attitude{i}[{j}]" for i in range(4, 7) for j in range(1, 7)),
]

# ===========================================================================
# Answer codes
# ===========================================================================


def _multiline(*parts):
    """Join the pieces of a long answer label with ``LINE_BREAK``."""
    return LINE_BREAK.join(parts)


YES_NO = {
    "AO01": "Yes",
    "AO02": "No",
    "AO03": "I don't know",
    "AO04": "I prefer not to say",
}

# hr1 is coded differently in the two survey types.
HR1_CIVSOC = {
    "AO01": "Full-time",
    "AO02": "Part-time (>50%)",
    "AO03": "Part-time (<50%)",
    "AO04": "Freelance",
    "AO05": "Unpaid",
    "AO06": "Other",
    "AO07": "I don't know",
    "AO08": "I prefer not to say",
}
HR1_MEDIA = {
    "AO01": "Full-time",
    "AO02": "Part-time (>50%)",
    "AO03": "Part-time (<50%)",
    "AO04": "Freelance",
    "AO05": "Unpaid",
    "AO08": "Other",
    "AO06": "I don't know",
    "AO07": "I prefer not to say",
}

GENDER = {
    "AO01": "Female",
    "AO02": "Non-binary",
    "AO03": "Male",
    "AO04": "I prefer not to say",
    "AO05": "Other",
}

KNOWLEDGE = {
    "AO01": "Expert knowledge",
    "AO02": "Advanced knowledge",
    "AO03": "Some knowledge",
    "AO04": "Basic knowledge",
    "AO05": "No knowledge",
    "AO06": "I don't know",
    "AO07": "I prefer not to say",
}

FUNDING = {
    "AO01": "A great deal of funding",
    "AO02": "Sufficient funding",
    "AO03": "Some funding",
    "AO04": "Little funding",
    "AO05": "No funding",
    "AO06": "I don't know",
    "AO07": "I prefer not to say",
}

FOI3 = {
    "AO01": "Yes, within 30 days",
    "AO02": "No, usually longer than 30 days",
    "AO03": "Never",
    "AO04": "I don't know",
    # Spelled like every other "prefer not to say" label so that the answer
    # forms one category with them (it used to read "I prefere not to say").
    "AO05": "I prefer not to say",
}

# foi4 is coded differently in the two survey types.
FOI4_CIVSOC = {
    "AO01": "Very helpful",
    "AO03": "Helpful in parts",
    "AO05": "Not helpful at all",
    "AO06": "I don't know",
    "AO07": "I prefer not to say",
}
FOI4_MEDIA = {
    "AO01": "Very helpful",
    "AO02": "Helpful in parts",
    "AO03": "Not helpful at all",
    "AO06": "I don't know",
    "AO07": "I prefer not to say",
}

_IMPORTANCE_SCALE = {
    "AO01": "Very important",
    "AO02": "Important",
    "AO03": "Somewhat important",
    "AO04": "Slightly important",
    "AO05": "Not important at all",
}
IMPORTANCE = {
    **_IMPORTANCE_SCALE,
    "AO06": "I don't know",
    "AO07": "I prefer not to say",
}
# This was hard to spot: only the CS survey for DE uses AO09/AO11 instead of
# AO06/AO07 for the two opt-out answers.
IMPORTANCE_CS_DE = {
    **_IMPORTANCE_SCALE,
    "AO09": "I don't know",
    "AO11": "I prefer not to say",
}

PROTECTOPS4 = {
    "AO01": _multiline(
        "I have full confidence that the right tools",
        "will protect my communication from surveillance",
    ),
    "AO02": _multiline(
        "Technological tools help to protect my identity",
        "to some extent, but an attacker with sufficient power",
        "may eventually be able to bypass my technological",
        "safeguards",
    ),
    "AO03": _multiline(
        "Under the current conditions of communications",
        "surveillance, technological solutions cannot offer",
        "sufficient protection for the data I handle",
    ),
    "AO04": _multiline(
        "I have no confidence in the protection offered by",
        "technological tools",
    ),
    "AO05": _multiline(
        "I try to avoid technology-based communication whenever",
        "possible when I work on intelligence-related issues",
    ),
    "AO06": "I don't know",
    "AO07": "I prefer not to say",
}

FREQUENCY = {
    "AO01": "Always",
    "AO02": "Often",
    "AO03": "Sometimes",
    "AO04": "Rarely",
    "AO05": "Never",
    "AO06": "I don't know",
    "AO07": "I prefer not to say",
}

CONSTRAINTINTER1 = {
    "AO01": "Yes, I have evidence",
    "AO02": "Yes, I suspect",
    "AO03": "No",
    "AO04": "I don't know",
    "AO05": "I prefer not to say",
}

CONSTRAINTINTER3 = {
    "AO01": "I was threatened with prosecution",
    "AO02": "I was prosecuted but acquitted",
    "AO03": "I was prosecuted and convicted",
    "AO04": "I don't know",
    "AO05": "I prefer not to say",
}

ATTITUDE1 = {
    "AO01": _multiline(
        "Intelligence agencies are incompatible with democratic",
        "values and should be abolished",
    ),
    "AO02": _multiline(
        "Intelligence agencies contradict democratic principles,",
        "and their powers should be kept at a bare minimum",
    ),
    "AO03": _multiline(
        "Intelligence agencies are necessary and legitimate institutions",
        "of democratic states, even though they may sometimes overstep",
        "their legal mandates",
    ),
    "AO04": _multiline(
        "Intelligence agencies are a vital component of national",
        "security and should be shielded from excessive bureaucratic",
        "restrictions",
    ),
    "AO05": "I prefer not to say",
}

ATTITUDE2 = {
    "AO01": _multiline(
        "Intelligence oversight generally succeeds in uncovering",
        "past misconduct and preventing future misconduct",
    ),
    "AO02": _multiline(
        "Intelligence oversight is mostly effective, however its",
        "institutional design needs reform for oversight practitioners",
        "to reliably uncover past misconduct and prevent future",
        "misconduct",
    ),
    "AO03": _multiline(
        "Intelligence oversight lacks efficacy, hence a fundamental",
        "reorganization of oversight capacity is needed for oversight",
        "practitioners to reliably uncover past misconduct and",
        "prevent future misconduct",
    ),
    "AO04": _multiline(
        "Effective intelligence oversight is a hopeless endeavour",
        "and even a systematic reorganization is unlikely to ensure",
        "misconduct is uncovered and prevented.",
    ),
    "AO05": "I prefer not to say",
}

_OVERSIGHT_COMMON = {
    "AO01": "Parliamentary oversight bodies",
    "AO02": "Judicial oversight bodies",
    "AO03": "Independent expert bodies",
    "AO04": "Data protection authorities",
    "AO06": "CSOs | The media",
}
OVERSIGHT_BODIES = {**_OVERSIGHT_COMMON, "AO05": "Audit courts"}
# Here, CS FR is coded differently: audit courts are AO07 instead of AO05.
OVERSIGHT_BODIES_CS_FR = {**_OVERSIGHT_COMMON, "AO07": "Audit courts"}

# ===========================================================================
# Loading
# ===========================================================================


def build_rename_map(prefix):
    """Return the raw-export -> harmonised column mapping for one survey.

    Args:
        prefix: ``"CS"`` (civil society) or ``"MS"`` (media).

    Returns:
        A dict suitable for ``DataFrame.rename(columns=...)``.
    """
    mapping = {"startlanguage": "XXcountry", "lastpage": "XXlastpage"}
    overrides = SUBQUESTION_NUMBERS[prefix]
    for question, options in SUBQUESTION_OPTIONS.items():
        numbers = overrides.get(question, range(1, len(options) + 1))
        for number, option in zip(numbers, options):
            raw = f"{prefix}{question}[SQ{number:02d}]"
            mapping[raw] = f"{prefix}{question}[{option}]"
    for question in ("constraintinter5", "constraintinter6"):
        mapping[f"{prefix}{question}ot"] = f"{prefix}{question}other"
    mapping.update(EXTRA_RENAMES[prefix])
    return mapping


def _construct_survey_df(prefix, csv_paths, surveytype):
    """Load, harmonise and trim the exports of one survey type."""
    # Merge CSV files into DataFrame
    df = pd.concat([pd.read_csv(path, sep=";") for path in csv_paths])
    df = df.rename(columns=build_rename_map(prefix))

    # Translate the language code only in the country column; replacing on
    # the whole frame would also rewrite any answer cell that happens to be
    # exactly "en", "de" or "fr".
    df["XXcountry"] = df["XXcountry"].replace(COUNTRY_BY_LANGUAGE)

    # Drop (very) incomplete surveys
    df = df[df["XXlastpage"] > 2]

    # Drop all but the columns needed for the analysis.  The copy detaches
    # the result from the filtered frame so the assignments below are safe.
    wanted = ["XXcountry", "XXlastpage"] + [prefix + name for name in ANSWER_COLUMNS]
    df = df[wanted].copy()

    # Make column names compatible: strip the two-letter XX/CS/MS prefix
    df.columns = df.columns.str[2:]

    df["surveytype"] = surveytype
    return df


def construct_cs_df():
    """Return the harmonised civil-society survey answers (UK, DE, FR)."""
    return _construct_survey_df("CS", CS_CSV_FILES, CIVIL_SOCIETY)


def construct_ms_df():
    """Return the harmonised media survey answers (UK, DE, FR)."""
    return _construct_survey_df("MS", MS_CSV_FILES, MEDIA)


# ===========================================================================
# Cleaning
# ===========================================================================


def _recode(df, column, mapping, mask=None):
    """Replace answer codes in ``column`` in place, optionally only where ``mask``."""
    recoded = df[column].replace(mapping)
    if mask is None:
        df[column] = recoded
    else:
        df.loc[mask, column] = recoded[mask]


def clean_answers(df):
    """Return a copy of the merged table with readable, typed answers.

    Args:
        df: Concatenation of :func:`construct_cs_df` and
            :func:`construct_ms_df` with a unique index; must contain
            ``country``, ``surveytype`` and every name in ``ANSWER_COLUMNS``.

    Returns:
        A new DataFrame in which answer codes are replaced by their labels,
        ``hr2``/``expertise1``/``foi2`` are numeric and the ``foi5[...]`` and
        ``attitude3[...]`` columns are boolean.  ``df`` itself is not modified.
    """
    df = df.copy()

    # Helper masks needed when answers are coded differently in the
    # respective survey types or languages
    is_civsoc = df["surveytype"] == CIVIL_SOCIETY
    is_media = df["surveytype"] == MEDIA
    is_de = df["country"] == "Germany"
    is_fr = df["country"] == "France"

    _recode(df, "hr1", HR1_CIVSOC, is_civsoc)
    _recode(df, "hr1", HR1_MEDIA, is_media)

    df["gender"] = df["gender"].fillna("Not specified")
    _recode(df, "gender", GENDER)

    for column in ("expertise2", "expertise3", "expertise4"):
        _recode(df, column, KNOWLEDGE)
    _recode(df, "finance1", FUNDING)

    _recode(df, "foi1", YES_NO)
    _recode(df, "foi3", FOI3)
    _recode(df, "foi4", FOI4_CIVSOC, is_civsoc)
    _recode(df, "foi4", FOI4_MEDIA, is_media)

    for column in _options("protectops1"):
        _recode(df, column, YES_NO)
    _recode(df, "protectops2", YES_NO)
    for column in _options("protectops3"):
        _recode(df, column, IMPORTANCE)
        _recode(df, column, IMPORTANCE_CS_DE, is_civsoc & is_de)
    _recode(df, "protectops4", PROTECTOPS4)

    _recode(df, "protectleg1", FREQUENCY)
    _recode(df, "protectleg2", YES_NO)
    for column in _options("protectleg3"):
        _recode(df, column, YES_NO)

    _recode(df, "constraintinter1", CONSTRAINTINTER1)
    _recode(df, "constraintinter2", YES_NO)
    _recode(df, "constraintinter3", CONSTRAINTINTER3)
    for question in ("constraintinter4", "constraintinter5", "constraintinter6"):
        for column in _options(question):
            _recode(df, column, YES_NO)

    _recode(df, "attitude1", ATTITUDE1)
    _recode(df, "attitude2", ATTITUDE2)
    for i in range(4, 7):
        for j in range(1, 7):
            column = f"attitude{i}[{j}]"
            _recode(df, column, OVERSIGHT_BODIES)
            _recode(df, column, OVERSIGHT_BODIES_CS_FR, is_fr & is_civsoc)

    # -----------------------------------------------------------------------
    # Make answers analysable (change data types etc.)
    # -----------------------------------------------------------------------
    # Free-text numbers: map the known odd spellings, turn the rest into NaN.
    df["hr2"] = pd.to_numeric(
        df["hr2"].replace({"?": np.nan, "0,5": 0.5}), errors="coerce"
    )
    df["expertise1"] = pd.to_numeric(
        df["expertise1"].replace({"?": np.nan, "<1": 0.5}), errors="coerce"
    )
    df["foi2"] = pd.to_numeric(
        df["foi2"].replace(
            {"20+": 20.0, " ca 10": 10.0, "several": 3.0, "15+": 15.0}
        ),
        errors="coerce",
    )

    # Multiple-choice answers become booleans: a missing cell is False, a
    # ticked one ("Y") is True.  Same result as replacing NaN with False and
    # casting to bool, without relying on object-dtype downcasting.
    for column in df.columns:
        if column.startswith(("foi5[", "attitude3")):
            df[column] = df[column].notna() & df[column].astype(bool)

    return df


def main():
    """Merge both surveys, clean the answers and export the result."""
    merged = pd.concat([construct_cs_df(), construct_ms_df()], ignore_index=True)
    cleaned = clean_answers(merged)

    cleaned.to_pickle("data/merged.pkl")
    cleaned.to_excel("data/merged.xlsx")
    cleaned.to_csv("data/merged.csv")


if __name__ == "__main__":
    main()