diff --git a/Cargo.toml b/Cargo.toml
index e88c248e3..9a89164ea 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -66,6 +66,7 @@ features = [
     "arange",
     "true_div",
     "dtype-categorical",
+    "string_justify", # presumably gates the rjust/ljust/zfill string ops used by the new bindings -- TODO confirm
     "diagonal_concat",
     "horizontal_concat",
     "abs",
diff --git a/__tests__/expr.test.ts b/__tests__/expr.test.ts
index 80f9f833f..fa407a55c 100644
--- a/__tests__/expr.test.ts
+++ b/__tests__/expr.test.ts
@@ -1190,6 +1190,54 @@ describe("expr.str", () => {
     expect(actual).toFrameEqual(expected);
     expect(seriesActual).toFrameEqual(expected);
   });
+  test("padStart", () => { // values shorter than 3 get leading "_"; "cow" and "longer" must come back unchanged
+    const df = pl.DataFrame({
+      foo: ["a", "b", "cow", "longer"],
+    });
+    const expected = pl.DataFrame({
+      foo: ["__a", "__b", "cow", "longer"],
+    });
+    const seriesActual = df // eager Series API path
+      .getColumn("foo")
+      .str.padStart(3, "_")
+      .rename("foo")
+      .toFrame();
+    const actual = df.select(col("foo").str.padStart(3, "_").as("foo")); // expression API path
+    expect(actual).toFrameEqual(expected);
+    expect(seriesActual).toFrameEqual(expected);
+  });
+  test("padEnd", () => { // values shorter than 3 get trailing "_"; "cow" and "longer" must come back unchanged
+    const df = pl.DataFrame({
+      foo: ["a", "b", "cow", "longer"],
+    });
+    const expected = pl.DataFrame({
+      foo: ["a__", "b__", "cow", "longer"],
+    });
+    const seriesActual = df // eager Series API path
+      .getColumn("foo")
+      .str.padEnd(3, "_")
+      .rename("foo")
+      .toFrame();
+    const actual = df.select(col("foo").str.padEnd(3, "_").as("foo")); // expression API path
+    expect(actual).toFrameEqual(expected);
+    expect(seriesActual).toFrameEqual(expected);
+  });
+  test("zFill", () => { // values shorter than 3 get leading "0"; "cow" and "longer" must come back unchanged
+    const df = pl.DataFrame({
+      foo: ["a", "b", "cow", "longer"],
+    });
+    const expected = pl.DataFrame({
+      foo: ["00a", "00b", "cow", "longer"],
+    });
+    const seriesActual = df // eager Series API path
+      .getColumn("foo")
+      .str.zFill(3)
+      .rename("foo")
+      .toFrame();
+    const actual = df.select(col("foo").str.zFill(3).as("foo")); // expression API path
+    expect(actual).toFrameEqual(expected);
+    expect(seriesActual).toFrameEqual(expected);
+  });
   test("hex encode", () => {
     const df = pl.DataFrame({
       original: ["foo", "bar", null],
diff --git a/__tests__/lazyframe.test.ts b/__tests__/lazyframe.test.ts
index 4e0e580bd..84430eabe 100644
--- a/__tests__/lazyframe.test.ts
+++ b/__tests__/lazyframe.test.ts
@@ -1042,4 +1042,43 @@ describe("lazyframe", () => {
     });
     expect(actual).toFrameEqual(expected);
   });
+  test("str:padStart", () => { // lazy pipeline: every one-character value is left-padded with "-" to length 3
+    const actual = pl.DataFrame({
+      "ham": ["a", "b", "c"]
+    }).lazy()
+      .withColumn(
+        pl.col("ham").str.padStart(3, "-")
+      )
+      .collectSync();
+    const expected = pl.DataFrame({
+      "ham": ["--a", "--b", "--c"]
+    });
+    expect(actual).toFrameEqual(expected);
+  });
+  test("str:padEnd", () => { // lazy pipeline: every one-character value is right-padded with "-" to length 3
+    const actual = pl.DataFrame({
+      "ham": ["a", "b", "c"]
+    }).lazy()
+      .withColumn(
+        pl.col("ham").str.padEnd(3, "-")
+      )
+      .collectSync();
+    const expected = pl.DataFrame({
+      "ham": ["a--", "b--", "c--"]
+    });
+    expect(actual).toFrameEqual(expected);
+  });
+  test("str:zFill", () => { // lazy pipeline: every one-character value is left-padded with "0" to length 3
+    const actual = pl.DataFrame({
+      "ham": ["a", "b", "c"]
+    }).lazy()
+      .withColumn(
+        pl.col("ham").str.zFill(3)
+      )
+      .collectSync();
+    const expected = pl.DataFrame({
+      "ham": ["00a", "00b", "00c"]
+    });
+    expect(actual).toFrameEqual(expected);
+  });
 });
diff --git a/polars/lazy/expr/string.ts b/polars/lazy/expr/string.ts
index b74694b06..906dc8624 100644
--- a/polars/lazy/expr/string.ts
+++ b/polars/lazy/expr/string.ts
@@ -51,8 +51,8 @@ export interface ExprString {
    * └─────────┘
    * ```
    */
-  decode(encoding: "hex" | "base64", strict?: boolean): Expr
-  decode(options: {encoding: "hex" | "base64", strict?: boolean}): Expr
+  decode(encoding: "hex" | "base64", strict?: boolean): Expr;
+  decode(options: { encoding: "hex" | "base64"; strict?: boolean }): Expr;
   /**
    * Encodes a value using the provided encoding
    * @param encoding - hex | base64
@@ -74,7 +74,7 @@ export interface ExprString {
    * └─────────┘
    * ```
    */
-  encode(encoding: "hex" | "base64"): Expr
+  encode(encoding: "hex" | "base64"): Expr;
   /**
    * Extract the target capture group from provided patterns.
    * @param pattern A valid regex pattern
@@ -140,7 +140,7 @@ export interface ExprString {
   /** Get length of the string values in the Series. */
   lengths(): Expr;
   /** Remove leading whitespace. */
-  lstrip(): Expr
+  lstrip(): Expr;
   /** Replace first regex match with a string value. */
   replace(pat: string | RegExp, val: string): Expr;
   /** Replace all regex matches with a string value. */
@@ -150,7 +150,105 @@ export interface ExprString {
   /** Modify the strings to their uppercase equivalent. */
   toUpperCase(): Expr;
   /** Remove trailing whitespace. */
-  rstrip(): Expr
+  rstrip(): Expr;
+  /**
+   * Add a leading fillChar to a string until string length is reached.
+   * If string is longer or equal to given length no modifications will be done
+   * @param {number} length - of the final string
+   * @param {string} fillChar - that will fill the string.
+   * @note If a string longer than 1 character is provided only the first character will be used
+   * @example
+   * ```
+   * > df = pl.DataFrame({
+   * ...   'foo': [
+   * ...       "a",
+   * ...       "b",
+   * ...       "LONG_WORD",
+   * ...       "cow"
+   * ...   ]})
+   * > df.select(pl.col('foo').str.padStart(3, "_"))
+   * shape: (4, 1)
+   * ┌──────────┐
+   * │ foo      │
+   * │ -------- │
+   * │ str      │
+   * ╞══════════╡
+   * │ __a      │
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ __b      │
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ LONG_WORD│
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ cow      │
+   * └──────────┘
+   * ```
+   */
+  padStart(length: number, fillChar: string): Expr;
+  /**
+   * Add leading "0" to a string until string length is reached.
+   * If string is longer or equal to given length no modifications will be done
+   * @param {number} length - of the final string
+   * @see {@link padStart}
+   * @example
+   * ```
+   * > df = pl.DataFrame({
+   * ...   'foo': [
+   * ...       "a",
+   * ...       "b",
+   * ...       "LONG_WORD",
+   * ...       "cow"
+   * ...   ]})
+   * > df.select(pl.col('foo').str.zFill(3))
+   * shape: (4, 1)
+   * ┌──────────┐
+   * │ foo      │
+   * │ -------- │
+   * │ str      │
+   * ╞══════════╡
+   * │ 00a      │
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ 00b      │
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ LONG_WORD│
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ cow      │
+   * └──────────┘
+   * ```
+   */
+  zFill(length: number): Expr;
+  /**
+   * Add a trailing fillChar to a string until string length is reached.
+   * If string is longer or equal to given length no modifications will be done
+   * @param {number} length - of the final string
+   * @param {string} fillChar - that will fill the string.
+   * @note If a string longer than 1 character is provided only the first character will be used
+   * @example
+   * ```
+   * > df = pl.DataFrame({
+   * ...   'foo': [
+   * ...       "a",
+   * ...       "b",
+   * ...       "LONG_WORD",
+   * ...       "cow"
+   * ...   ]})
+   * > df.select(pl.col('foo').str.padEnd(3, "_"))
+   * shape: (4, 1)
+   * ┌──────────┐
+   * │ foo      │
+   * │ -------- │
+   * │ str      │
+   * ╞══════════╡
+   * │ a__      │
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ b__      │
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ LONG_WORD│
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ cow      │
+   * └──────────┘
+   * ```
+   */
+  padEnd(length: number, fillChar: string): Expr;
   /**
    * Create subslices of the string values of a Utf8 Series.
    * @param start - Start of the slice (negative indexing may be used).
@@ -162,16 +260,16 @@ export interface ExprString {
    * @param separator — A string that identifies character or characters to use in separating the string.
    * @param inclusive Include the split character/string in the results
    */
-  split(by: string, options?: {inclusive?: boolean} | boolean): Expr
+  split(by: string, options?: { inclusive?: boolean } | boolean): Expr;
   /** Remove leading and trailing whitespace. */
-  strip(): Expr
+  strip(): Expr;
   /**
    * Parse a Series of dtype Utf8 to a Date/Datetime Series.
    * @param datatype Date or Datetime.
    * @param fmt formatting syntax. [Read more](https://docs.rs/chrono/0.4.19/chrono/format/strftime/index.html)
    */
-  strptime(datatype: DataType.Date, fmt?: string): Expr
-  strptime(datatype: DataType.Datetime, fmt?: string): Expr
+  strptime(datatype: DataType.Date, fmt?: string): Expr;
+  strptime(datatype: DataType.Datetime, fmt?: string): Expr;
 }
 
 export const ExprStringFunctions = (_expr: any): ExprString => {
@@ -235,6 +333,15 @@ export const ExprStringFunctions = (_expr: any): ExprString => {
     rstrip() {
       return wrap("strRstrip");
     },
+    padStart(length: number, fillChar: string){
+      return wrap("strPadStart", length, fillChar);
+    },
+    zFill(length: number) {
+      return wrap("strZFill", length);
+    },
+    padEnd(length: number, fillChar: string) {
+      return wrap("strPadEnd", length, fillChar);
+    },
     slice(start: number, length?: number) {
       return wrap("strSlice", start, length);
     },
diff --git a/polars/series/string.ts b/polars/series/string.ts
index 0971dbadb..47e37518c 100644
--- a/polars/series/string.ts
+++ b/polars/series/string.ts
@@ -128,6 +128,72 @@ export interface StringFunctions {
   lengths(): Series
   /** Remove leading whitespace. */
   lstrip(): Series
+  /**
+   * Add a leading fillChar to a string until string length is reached.
+   * If string is longer or equal to given length no modifications will be done
+   * @param {number} length - of the final string
+   * @param {string} fillChar - that will fill the string.
+   * @note If a string longer than 1 character is provided only the first character will be used
+   * @example
+   * ```
+   * > df = pl.DataFrame({
+   * ...   'foo': [
+   * ...       "a",
+   * ...       "b",
+   * ...       "LONG_WORD",
+   * ...       "cow"
+   * ...   ]})
+   * > df.select(pl.col('foo').str.padStart(3, "_"))
+   * shape: (4, 1)
+   * ┌──────────┐
+   * │ foo      │
+   * │ -------- │
+   * │ str      │
+   * ╞══════════╡
+   * │ __a      │
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ __b      │
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ LONG_WORD│
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ cow      │
+   * └──────────┘
+   * ```
+   */
+  padStart(length: number, fillChar: string): Series
+  /**
+   * Add a leading '0' to a string until string length is reached.
+   * If string is longer or equal to given length no modifications will be done
+   * @param {number} length - of the final string
+   * @example
+   * ```
+   * > df = pl.DataFrame({
+   * ...   'foo': [
+   * ...       "a",
+   * ...       "b",
+   * ...       "LONG_WORD",
+   * ...       "cow"
+   * ...   ]})
+   * > df.select(pl.col('foo').str.zFill(3))
+   * shape: (4, 1)
+   * ┌──────────┐
+   * │ foo      │
+   * │ -------- │
+   * │ str      │
+   * ╞══════════╡
+   * │ 00a      │
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ 00b      │
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ LONG_WORD│
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ cow      │
+   * └──────────┘
+   * ```
+   */
+  zFill(length: number): Series
+  /** Add a trailing fillChar to a string until string length is reached; only the first character of fillChar is used. */
+  padEnd(length: number, fillChar: string): Series
   /**
    * Replace first regex match with a string value.
* @param pattern A valid regex pattern @@ -236,6 +302,15 @@ export const StringFunctions = (_s: any): StringFunctions => { lstrip() { return wrap("strReplace", /^\s*/.source, ""); }, + padStart(length: number, fillChar: string) { + return wrap("strPadStart", length, fillChar); + }, + zFill(length: number) { + return wrap("strZFill", length); + }, + padEnd(length: number, fillChar: string) { + return wrap("strPadEnd", length, fillChar); + }, replace(pat: RegExp, val: string) { return wrap("strReplace", regexToString(pat), val); }, diff --git a/src/lazy/dsl.rs b/src/lazy/dsl.rs index 90b4ce737..4bbd34d06 100644 --- a/src/lazy/dsl.rs +++ b/src/lazy/dsl.rs @@ -608,6 +608,50 @@ impl JsExpr { .into() } + #[napi] + pub fn str_pad_start(&self, length: i64, fill_char: String) -> JsExpr { + let function = move |s: Series| { + let ca = s.utf8()?; + Ok(ca + .rjust(length as usize, fill_char.chars().nth(0).unwrap()) + .into_series()) + }; + + self.clone() + .inner + .map(function, GetOutput::from_type(DataType::Utf8)) + .with_fmt("str.pad_start") + .into() + } + + #[napi] + pub fn str_pad_end(&self, length: i64, fill_char: String) -> JsExpr { + let function = move |s: Series| { + let ca = s.utf8()?; + Ok(ca + .ljust(length as usize, fill_char.chars().nth(0).unwrap()) + .into_series()) + }; + + self.clone() + .inner + .map(function, GetOutput::from_type(DataType::Utf8)) + .with_fmt("str.pad_end") + .into() + } + #[napi] + pub fn str_z_fill(&self, width: i64) -> JsExpr { + let function = move |s: Series| { + let ca = s.utf8()?; + Ok(ca.zfill(width as usize).into_series()) + }; + + self.clone() + .inner + .map(function, GetOutput::from_type(DataType::Utf8)) + .with_fmt("str.z_fill") + .into() + } #[napi] pub fn str_to_uppercase(&self) -> JsExpr { let function = |s: Series| { diff --git a/src/series.rs b/src/series.rs index 03db6b65e..d664eae22 100644 --- a/src/series.rs +++ b/src/series.rs @@ -893,7 +893,28 @@ impl JsSeries { .into_series(); Ok(s.into()) } - + #[napi] + pub fn 
str_pad_start(&self, length: i64, fill_char: String) -> napi::Result { + let ca = self.series.utf8().map_err(JsPolarsErr::from)?; + let s = ca + .rjust(length as usize, fill_char.chars().nth(0).unwrap()) + .into_series(); + Ok(s.into()) + } + #[napi] + pub fn str_pad_end(&self, length: i64, fill_char: String) -> napi::Result { + let ca = self.series.utf8().map_err(JsPolarsErr::from)?; + let s = ca + .ljust(length as usize, fill_char.chars().nth(0).unwrap()) + .into_series(); + Ok(s.into()) + } + #[napi] + pub fn str_z_fill(&self, length: i64) -> napi::Result { + let ca = self.series.utf8().map_err(JsPolarsErr::from)?; + let s = ca.zfill(length as usize).into_series(); + Ok(s.into()) + } #[napi] pub fn strftime(&self, fmt: String) -> napi::Result { let s = self.series.strftime(&fmt).map_err(JsPolarsErr::from)?;