TODO: New senator bioguide IDs.

2024 Election Results From https://clerk.house.gov/xml/lists/unofficial-119-member-elect-data.xml, birthdays from Wikipedia, and review by the GovTrack team.
unitedstates · Dec 29, 2024 · aae0543 · aae0543
1 parent 9b024af
commit aae0543
Show file tree

Hide file tree

Showing 9 changed files with 10,075 additions and 23,636 deletions.
diff --git a/committee-membership-current.yaml b/committee-membership-current.yaml
diff --git a/legislators-current.yaml b/legislators-current.yaml
diff --git a/legislators-district-offices.yaml b/legislators-district-offices.yaml
diff --git a/legislators-historical.yaml b/legislators-historical.yaml
diff --git a/legislators-social-media.yaml b/legislators-social-media.yaml
diff --git a/scripts/archive/election_results_2024.csv b/scripts/archive/election_results_2024.csv
diff --git a/scripts/election_results.py b/scripts/election_results.py
@@ -31,8 +31,7 @@
 #   git checkout origin/main ../*.yaml
 # * Run this script.
 # * Make other changes manually for special elections.
-# * Run sweep.py to clear out social media info for now-not-serving legislators.
-# * Run wikidata_update.py to fill in some other missing fields.
+# * Run wikidata_update.py to fill in some other fields.
 # * Run `NOW=2023-01-03 test/validate.py` to check for errors.
 
 import traceback
@@ -41,7 +40,7 @@
 import collections, csv, re
 from utils import load_data, save_data
 
-ELECTION_YEAR = 2022
+ELECTION_YEAR = 2024
 
 def run():
 	# Compute helper constants.
@@ -64,13 +63,14 @@ def run():
 	for p in legislators_current:
 		if p["terms"][-1]["type"] == "sen" and p["terms"][-1]["class"] != SENATE_CLASS:
 			current.append(p["id"]["govtrack"])
-		if p["terms"][-1]["state"] == "PR" and (ELECTION_YEAR % 4 == 0):
+		if p["terms"][-1]["state"] == "PR" and (ELECTION_YEAR % 4 != 0):
 			current.append(p["id"]["govtrack"])
 
-	# Map govtrack IDs to existing legislators.
-	govtrack_id_map = { }
+	# Map bioguide IDs to existing legislators to read the Bioguide ID
+	# column of the CSV file.
+	bioguide_id_map = { }
 	for entry in legislators_historical + legislators_current:
-		govtrack_id_map[entry['id']['govtrack']] = entry
+		bioguide_id_map[entry['id']['bioguide']] = entry
 
 	# Get highest existing GovTrack ID to know where to start for assigning new IDs.
 	# Store it in a mutable data structure so that the inner function can increment it.
@@ -85,48 +85,33 @@ def process_row(row):
 		# district means a senate race.
 		state, district = re.match(r"^([A-Z]{2})(\d*)$", row["Race"]).groups()
 
-		if row['GovTrack ID'] != "":
-			# Use the GovTrack ID to get the legislator who won, which might be
+		if row['Bioguide ID'] in bioguide_id_map:
+			# Use the Bioguide ID to get the legislator who won, which might be
 			# the incumbent or a representative elected to the senate, or
-			# someone who used to serve in Congress, etc.
-			p = govtrack_id_map[int(row['GovTrack ID'])]
-		elif row['Incumbent Win? Y/N'] == 'Y':
-			raise ValueError("Incumbent should have a GovTrack ID.")
-
-			# Use the race code to get the legislator who won.
-			incumbent = [p for p in legislators_current
-			             if  p["terms"][-1]["type"] == ("sen" if district == "" else "rep")
-			             and p["terms"][-1]["state"] == state
-			             and ((p["terms"][-1]["district"] == int(district)) if district != ""
-		             	 else (p["terms"][-1]["class"] == SENATE_CLASS))
-			            ]
-			if len(incumbent) < 1:
-				raise ValueError("Could not find incumbent.")
-			if len(incumbent) > 1:
-				raise ValueError("Matched on more than one incumbent.")
-			p = incumbent[0]
-			#if row['GovTrack ID'] != "" and int(row['GovTrack ID']) != p["id"]["govtrack"]:
-			#	raise ValueError("GovTrack ID doesn't match incumbent.")
-		elif row['Incumbent Win? Y/N'] == 'N':
+			# someone who previously served in Congress, etc. The House provides
+			# draft IDs for new members, so the ID in the spreadsheet may not
+			# match an existing person.
+			p = bioguide_id_map[row['Bioguide ID']]
+		else:
 			# Make a new legislator entry.
 			max_govtrack_id.value += 1
 			p = collections.OrderedDict([
 				("id", collections.OrderedDict([
-					#("bioguide", row['Bioguide ID']),
+					("bioguide", row['Bioguide ID'] if row['Bioguide ID'] != "(not assigned)" else None),
 					("fec", [row['FEC.gov ID']]),
 					("govtrack", max_govtrack_id.value),
 					#("opensecrets", None), # don't know yet
 					#("votesmart", int(row['votesmart'])), # not doing this anymore
-					#("wikipedia", row['Wikipedia Page Name']), # will convert from Wikidata
-					("wikidata", row['Wikidata ID']),
+					("wikipedia", row['Wikipedia URL'].replace("https://en.wikipedia.org/wiki/", "").replace("_", " ")),
+					#("wikidata", row['Wikidata ID']), # will convert from wikipedia
 					#("ballotpedia", row['Ballotpedia Page Name']),
 				])),
 				("name", collections.OrderedDict([
 					("first", row['First Name']),
 					("middle", row['Middle Name']),
 					("last", row['Last Name']),
 					("suffix", row['Suffix']),
-					#("official_full", mi.find('official-name').text), #not available yet
+					("official_full", row['Name']), # best guess
 				])),
 				("bio", collections.OrderedDict([
 				 	("gender", row['Gender (M/F)']),
@@ -136,20 +121,14 @@ def process_row(row):
 			])
 
 			# Delete keys that were filled with Nones or empty strings
-			# because we don't have the data yet.
+			# because we don't have the data yet, other than Bioguide ID
+			# because we'll need that to be filled in manually anyway.
 			for section in ("id", "name", "bio"):
 				for k in list(p[section]): # clone key list before modifying dict
-					if not p[section][k]:
+					if not p[section][k] and not (section == "id" and k == "bioguide"):
 						del p[section][k]
 
 			new_legislators.append(p)
-		else:
-			# There is no winner in this election. The incumbent
-			# will not be marked as still serving, so they'll
-			# be moved to the historical file, and no person will
-			# be added for this race.
-			print("No election result for", row["Race"], row["Incumbent Win? Y/N"])
-			return
 
 		# Add to array marking this legislator as currently serving.
 		current.append(p['id']['govtrack'])
@@ -172,21 +151,27 @@ def process_row(row):
 				("state", state),
 				("district", int(district)),
 			])
+
 		# If party is given in the table (for some incumbents and
 		# all new winners), use it. Otherwise just make a field so
 		# it's in the right order.
 		term.update(collections.OrderedDict([
-			("party", party_map[row['Party']] if row['Party'] else None),
+			("party", party_map[row['Party (D/R/I)']] if row['Party (D/R/I)'] else None),
 		]))
 		p['terms'].append(term)
 		if term['party'] == "Independent":
 			term["caucus"] = row['Caucus']
 
-		if len(p['terms']) > 1 and p["terms"][-2]["type"] == term["type"]:
-			# This is an incumbent (or at least served in the same chamber previously).
+		if len(p['terms']) > 1:
+			# This is an incumbent or at least served previously.
 			# Copy some fields forward that are likely to remain the same, if we
 			# haven't already set them.
-			for k in ('party', 'url', 'rss_url'):
+			for k in ('party', 'caucus'):
+				if k in p['terms'][-2] and not term.get(k):
+					term[k] = p['terms'][-2][k]
+		if len(p['terms']) > 1 and p["terms"][-2]["type"] == term["type"]:
+			# Copy some more fields if the last term was in the same chamber.
+			for k in ('url', 'rss_url'):
 				if k in p['terms'][-2] and not term.get(k):
 					term[k] = p['terms'][-2][k]
 
@@ -270,8 +255,8 @@ def process_row(row):
 
 	# Run the sweep script to clear out data that needs to be cleared out
 	# for legislators that are gone.
-	#import sweep
-	#sweep.run()
+	import sweep
+	sweep.run()
 
 	# Clears committee membership.
 	save_data({}, "committee-membership-current.yaml")

diff --git a/scripts/utils.py b/scripts/utils.py
@@ -390,7 +390,7 @@ def fixup(m):
         pass
     return text # leave as is
 
-  text = re.sub("&#?\w+;", fixup, text)
+  text = re.sub(r"&#?\w+;", fixup, text)
   text = remove_unicode_control(text)
   return text
 

diff --git a/scripts/wikidata_update.py b/scripts/wikidata_update.py
@@ -1,10 +1,23 @@
 #!/usr/bin/python
 
 import re
-from urllib.parse import unquote
+import urllib.request
+import json
+from urllib.parse import quote, unquote
 from utils import load_data, save_data
 from SPARQLWrapper import SPARQLWrapper, JSON
 
+def get_wikidata_ids(legislators):
+    # Look up wikidata IDs for legislators with English Wikipedia IDs.
+    for p in legislators:
+        if not p["id"].get("wikidata") and p["id"].get("wikipedia"):
+            w = quote(p["id"]["wikipedia"].replace(" ", "_"))
+            query_url = f"https://en.wikipedia.org/w/api.php?action=query&prop=pageprops&titles={w}&format=json"
+            response = json.load(urllib.request.urlopen(query_url))
+            wikidata_id = list(response["query"]["pages"].values())[0]["pageprops"]["wikibase_item"]
+            p["id"]["wikidata"] = wikidata_id
+
+
 def get_ids_from_wikidata(legislators):
     # Query to fetch information for entities that have a bioguide ID.
     # Selecting on bioguide ID efficiently gets wikidata entries that
@@ -133,6 +146,7 @@ def run_query(query):
 def run():
   p1 = load_data("legislators-current.yaml")
   p2 = load_data("legislators-historical.yaml")
+  get_wikidata_ids(p1+p2)
   get_ids_from_wikidata(p1+p2)
   get_ids_from_wikidata_without_bioguide(p1+p2)
   save_data(p1, "legislators-current.yaml")