-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmade-smarter-postcode-clean.py
122 lines (105 loc) · 4.53 KB
/
made-smarter-postcode-clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#%%
import polars as pl
import duckdb
from janitor.polars import clean_names
import polars.selectors as cs
#%%
def get_sheet(file_path: str, sheet_name: str) -> pl.DataFrame:
return (pl
.read_excel(file_path, sheet_name = sheet_name)
.clean_names(remove_special = True))
#%%
def clean_postcode(df: pl.DataFrame, postcode_col: str) -> pl.DataFrame:
"""
Cleans and validates UK postcodes in a Polars DataFrame.
This function processes a specified column containing UK postcodes by:
- Removing spaces and converting to uppercase.
- Splitting the postcode into start and end segments.
- Reconstructing the cleaned postcode.
- Validating the cleaned postcode against a regex pattern.
Args:
df (pl.DataFrame): The input Polars DataFrame containing postcode data.
postcode_col (str): The name of the column containing postcodes. If None, the function will attempt to find a column with "postcode" in its name.
Returns:
pl.DataFrame: A new DataFrame with the cleaned and validated postcodes, and a boolean column indicating validity.
Raises:
ValueError: If no postcode column is found in the DataFrame.
"""
if postcode_col is None:
postcode_col = next((col
for col
in df.columns
if "postcode" in col.lower()),
None)
if postcode_col is None:
raise ValueError("No postcode column found")
return (df
.with_columns(pl.col(postcode_col)
.str
.replace_all(" ", "")
.str
.to_uppercase()
.alias("postcode"))
.with_columns(pl.col("postcode").str.slice(-3).alias("postcode_end"))
.with_columns(
pl.when(pl.col("postcode").str.len_chars() == 7)
.then(pl.col("postcode").str.slice(0, 4))
.otherwise(pl.col("postcode").str.slice(0, 3))
.alias("postcode_start")
)
.with_columns(
pl.concat_str([pl.col("postcode_start"),
pl.col("postcode_end")],
separator=" ")
.alias("postcode_clean"))
.with_columns(valid_pc = pl.col("postcode_clean")
.str
.contains("^(?:(?:[A-Z]{1,2}[0-9][0-9]?[A-Z]?[ ]?[0-9][A-Z]{2})|(?:[A-Z]{1}[0-9]{1,2}[ ]?[0-9][A-Z]{2}))$")
.alias("valid_pc"))
.drop(["postcode_start", "postcode_end", "postcode", postcode_col])
)
#%%
df_applicant = get_sheet(file_path = "data/Made Smarter postcodes.xlsx",
sheet_name = "Full MS applicant list")
#%%
df_onboarded = get_sheet(file_path = "data/Made Smarter postcodes.xlsx",
sheet_name = "Onboarded MS businesses")
#%%
df_applicant_clean = (clean_postcode(df_applicant,
postcode_col = None)
.with_columns(type = pl.lit("Applicant")))
df_onboarded_clean = (clean_postcode(df_onboarded,
postcode_col = None)
.with_columns(type = pl.lit("Onboarded")))
#%%
df_made_smarter_all_df = (pl.concat([df_applicant_clean, df_onboarded_clean],
how = "vertical")
.filter(pl.col("valid_pc"))
.group_by("postcode_clean", "type")
.len("count")
.sort("count", descending = True))
#%%
df_made_smarter_all_df.glimpse()
#%%
con = duckdb.connect("../weca_cesap/data/postcodes.duckdb")
#%%
con.sql("SHOW TABLES;")
# %%
all_postcodes = con.sql("FROM postcode_centroids_tbl").pl()
#%%
all_postcodes.glimpse()
#%%
made_smarter_ods_df = (all_postcodes
.join(df_made_smarter_all_df,
left_on="pcds",
right_on = "postcode_clean",
how="inner")
.select(["pcds", "lat", "long", "type", "count"])
.with_columns(pl.concat_str([pl.lit("{"), pl.col("lat"), pl.lit(", "), pl.col("long"), pl.lit("}")]).alias("geo_point_2d"))
.sort(["type", "count"])
)
#%%
made_smarter_ods_df.write_csv("data/made_smarter_ods.csv")
#%%
con.close()
# %%