Skip to content

Commit

Permalink
Feature/add str padding methods (pola-rs#41)
Browse files Browse the repository at this point in the history
* Added option interface to io functions.

* Made interfaces PascalCase

* Fixed Errors from tests
Two tests failing - don't know why

* Added padStart, padEnd and justify to lazy str.

* Edited jsdoc

* Added example

* Edit on example

* Reformat arguments
Reformat justify to zfill

* Corrected test to reformat

* Added padStart, padEnd and zFill to series

* Removed dtype-struct from features

* Removed todo

* Fixed error

* Deleted this

* Code formatting
  • Loading branch information
cojmeister authored Jan 29, 2023
1 parent 1a7cef0 commit ca8b7b8
Show file tree
Hide file tree
Showing 7 changed files with 345 additions and 10 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ features = [
"arange",
"true_div",
"dtype-categorical",
"string_justify",
"diagonal_concat",
"horizontal_concat",
"abs",
Expand Down
48 changes: 48 additions & 0 deletions __tests__/expr.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1190,6 +1190,54 @@ describe("expr.str", () => {
expect(actual).toFrameEqual(expected);
expect(seriesActual).toFrameEqual(expected);
});
// Expression and Series `str.padStart` must agree: values shorter than the
// target length are left-padded with "_", the rest stay untouched.
test("padStart", () => {
  const input = pl.DataFrame({ foo: ["a", "b", "cow", "longer"] });
  const want = pl.DataFrame({ foo: ["__a", "__b", "cow", "longer"] });

  // Series API path.
  const paddedSeries = input.getColumn("foo").str.padStart(3, "_");
  const viaSeries = paddedSeries.rename("foo").toFrame();

  // Expression API path.
  const paddedExpr = col("foo").str.padStart(3, "_").as("foo");
  const viaExpr = input.select(paddedExpr);

  expect(viaExpr).toFrameEqual(want);
  expect(viaSeries).toFrameEqual(want);
});
// Expression and Series `str.padEnd` must agree: values shorter than the
// target length are right-padded with "_", the rest stay untouched.
test("padEnd", () => {
  const input = pl.DataFrame({ foo: ["a", "b", "cow", "longer"] });
  const want = pl.DataFrame({ foo: ["a__", "b__", "cow", "longer"] });

  // Series API path.
  const paddedSeries = input.getColumn("foo").str.padEnd(3, "_");
  const viaSeries = paddedSeries.rename("foo").toFrame();

  // Expression API path.
  const paddedExpr = col("foo").str.padEnd(3, "_").as("foo");
  const viaExpr = input.select(paddedExpr);

  expect(viaExpr).toFrameEqual(want);
  expect(viaSeries).toFrameEqual(want);
});
// Expression and Series `str.zFill` must agree: values shorter than the
// target length get leading "0" characters, the rest stay untouched.
test("zFill", () => {
  const input = pl.DataFrame({ foo: ["a", "b", "cow", "longer"] });
  const want = pl.DataFrame({ foo: ["00a", "00b", "cow", "longer"] });

  // Series API path.
  const filledSeries = input.getColumn("foo").str.zFill(3);
  const viaSeries = filledSeries.rename("foo").toFrame();

  // Expression API path.
  const filledExpr = col("foo").str.zFill(3).as("foo");
  const viaExpr = input.select(filledExpr);

  expect(viaExpr).toFrameEqual(want);
  expect(viaSeries).toFrameEqual(want);
});
test("hex encode", () => {
const df = pl.DataFrame({
original: ["foo", "bar", null],
Expand Down
39 changes: 39 additions & 0 deletions __tests__/lazyframe.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1042,4 +1042,43 @@ describe("lazyframe", () => {
});
expect(actual).toFrameEqual(expected);
});
// Lazy pipeline: `str.padStart` inside `withColumn` left-pads every value to width 3.
test("str:padStart", () => {
  const lazyInput = pl.DataFrame({ ham: ["a", "b", "c"] }).lazy();
  const padded = pl.col("ham").str.padStart(3, "-");
  const got = lazyInput.withColumn(padded).collectSync();

  const want = pl.DataFrame({ ham: ["--a", "--b", "--c"] });
  expect(got).toFrameEqual(want);
});
// Lazy pipeline: `str.padEnd` inside `withColumn` right-pads every value to width 3.
test("str:padEnd", () => {
  const lazyInput = pl.DataFrame({ ham: ["a", "b", "c"] }).lazy();
  const padded = pl.col("ham").str.padEnd(3, "-");
  const got = lazyInput.withColumn(padded).collectSync();

  const want = pl.DataFrame({ ham: ["a--", "b--", "c--"] });
  expect(got).toFrameEqual(want);
});
// Lazy pipeline: `str.zFill` inside `withColumn` zero-fills every value to width 3.
test("str:zFill", () => {
  const lazyInput = pl.DataFrame({ ham: ["a", "b", "c"] }).lazy();
  const filled = pl.col("ham").str.zFill(3);
  const got = lazyInput.withColumn(filled).collectSync();

  const want = pl.DataFrame({ ham: ["00a", "00b", "00c"] });
  expect(got).toFrameEqual(want);
});
});
125 changes: 116 additions & 9 deletions polars/lazy/expr/string.ts
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ export interface ExprString {
* └─────────┘
* ```
*/
decode(encoding: "hex" | "base64", strict?: boolean): Expr
decode(options: {encoding: "hex" | "base64", strict?: boolean}): Expr
decode(encoding: "hex" | "base64", strict?: boolean): Expr;
decode(options: { encoding: "hex" | "base64"; strict?: boolean }): Expr;
/**
* Encodes a value using the provided encoding
* @param encoding - hex | base64
Expand All @@ -74,7 +74,7 @@ export interface ExprString {
* └─────────┘
* ```
*/
encode(encoding: "hex" | "base64"): Expr
encode(encoding: "hex" | "base64"): Expr;
/**
* Extract the target capture group from provided patterns.
* @param pattern A valid regex pattern
Expand Down Expand Up @@ -140,7 +140,7 @@ export interface ExprString {
/** Get length of the string values in the Series. */
lengths(): Expr;
/** Remove leading whitespace. */
lstrip(): Expr
lstrip(): Expr;
/** Replace first regex match with a string value. */
replace(pat: string | RegExp, val: string): Expr;
/** Replace all regex matches with a string value. */
Expand All @@ -150,7 +150,105 @@ export interface ExprString {
/** Modify the strings to their uppercase equivalent. */
toUpperCase(): Expr;
/** Remove trailing whitespace. */
rstrip(): Expr
rstrip(): Expr;
/**
* Add a leading fillChar to a string until string length is reached.
* If string is longer or equal to given length no modifications will be done
* @param {number} length - of the final string
* @param {string} fillChar - that will fill the string.
* @note If a string longer than 1 character is provided only the first character will be used
* @example
* ```
* > df = pl.DataFrame({
* ... 'foo': [
* ... "a",
* ... "b",
* ... "LONG_WORD",
* ... "cow"
* ... ]})
* > df.select(pl.col('foo').str.padStart(3, "_"))
* shape: (4, 1)
* ┌──────────┐
* │ foo      │
* │ -------- │
* │ str      │
* ╞══════════╡
* │ __a      │
* ├╌╌╌╌╌╌╌╌╌╌┤
* │ __b      │
* ├╌╌╌╌╌╌╌╌╌╌┤
* │ LONG_WORD│
* ├╌╌╌╌╌╌╌╌╌╌┤
* │ cow      │
* └──────────┘
* ```
*/
padStart(length: number, fillChar: string): Expr;
/**
* Add leading "0" to a string until string length is reached.
* If string is longer or equal to given length no modifications will be done
* @param {number} length - of the final string
* @see {@link padStart}
* @example
* ```
* > df = pl.DataFrame({
* ... 'foo': [
* ... "a",
* ... "b",
* ... "LONG_WORD",
* ... "cow"
* ... ]})
* > df.select(pl.col('foo').str.zFill(3))
* shape: (4, 1)
* ┌──────────┐
* │ foo      │
* │ -------- │
* │ str      │
* ╞══════════╡
* │ 00a      │
* ├╌╌╌╌╌╌╌╌╌╌┤
* │ 00b      │
* ├╌╌╌╌╌╌╌╌╌╌┤
* │ LONG_WORD│
* ├╌╌╌╌╌╌╌╌╌╌┤
* │ cow      │
* └──────────┘
* ```
*/
zFill(length: number): Expr;
/**
* Add a trailing fillChar to a string until string length is reached.
* If string is longer or equal to given length no modifications will be done
* @param {number} length - of the final string
* @param {string} fillChar - that will fill the string.
* @note If a string longer than 1 character is provided only the first character will be used
* @example
* ```
* > df = pl.DataFrame({
* ... 'foo': [
* ... "a",
* ... "b",
* ... "LONG_WORD",
* ... "cow"
* ... ]})
* > df.select(pl.col('foo').str.padEnd(3, "_"))
* shape: (4, 1)
* ┌──────────┐
* │ foo      │
* │ -------- │
* │ str      │
* ╞══════════╡
* │ a__      │
* ├╌╌╌╌╌╌╌╌╌╌┤
* │ b__      │
* ├╌╌╌╌╌╌╌╌╌╌┤
* │ LONG_WORD│
* ├╌╌╌╌╌╌╌╌╌╌┤
* │ cow      │
* └──────────┘
* ```
*/
padEnd(length: number, fillChar: string): Expr;
/**
* Create subslices of the string values of a Utf8 Series.
* @param start - Start of the slice (negative indexing may be used).
Expand All @@ -162,16 +260,16 @@ export interface ExprString {
* @param separator — A string that identifies character or characters to use in separating the string.
* @param inclusive Include the split character/string in the results
*/
split(by: string, options?: {inclusive?: boolean} | boolean): Expr
split(by: string, options?: { inclusive?: boolean } | boolean): Expr;
/** Remove leading and trailing whitespace. */
strip(): Expr
strip(): Expr;
/**
* Parse a Series of dtype Utf8 to a Date/Datetime Series.
* @param datatype Date or Datetime.
* @param fmt formatting syntax. [Read more](https://docs.rs/chrono/0.4.19/chrono/format/strftime/index.html)
*/
strptime(datatype: DataType.Date, fmt?: string): Expr
strptime(datatype: DataType.Datetime, fmt?: string): Expr
strptime(datatype: DataType.Date, fmt?: string): Expr;
strptime(datatype: DataType.Datetime, fmt?: string): Expr;
}

export const ExprStringFunctions = (_expr: any): ExprString => {
Expand Down Expand Up @@ -235,6 +333,15 @@ export const ExprStringFunctions = (_expr: any): ExprString => {
rstrip() {
return wrap("strRstrip");
},
// Expression-side padStart: forwards `length` and `fillChar` unchanged to the
// "strPadStart" operation through `wrap`.
padStart(length: number, fillChar: string){
return wrap("strPadStart", length, fillChar);
},
// Expression-side zFill: forwards `length` unchanged to the "strZFill"
// operation through `wrap`.
zFill(length: number) {
return wrap("strZFill", length);
},
// Expression-side padEnd: forwards `length` and `fillChar` unchanged to the
// "strPadEnd" operation through `wrap`.
padEnd(length: number, fillChar: string) {
return wrap("strPadEnd", length, fillChar);
},
slice(start: number, length?: number) {
return wrap("strSlice", start, length);
},
Expand Down
75 changes: 75 additions & 0 deletions polars/series/string.ts
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,72 @@ export interface StringFunctions {
lengths(): Series
/** Remove leading whitespace. */
lstrip(): Series
/**
* Add a leading fillChar to a string until string length is reached.
* If string is longer or equal to given length no modifications will be done
* @param {number} length - of the final string
* @param {string} fillChar - that will fill the string.
* @note If a string longer than 1 character is provided only the first character will be used
* @example
* ```
* > df = pl.DataFrame({
* ... 'foo': [
* ... "a",
* ... "b",
* ... "LONG_WORD",
* ... "cow"
* ... ]})
* > df.select(pl.col('foo').str.padStart(3, "_"))
* shape: (4, 1)
* ┌──────────┐
* │ foo      │
* │ -------- │
* │ str      │
* ╞══════════╡
* │ __a      │
* ├╌╌╌╌╌╌╌╌╌╌┤
* │ __b      │
* ├╌╌╌╌╌╌╌╌╌╌┤
* │ LONG_WORD│
* ├╌╌╌╌╌╌╌╌╌╌┤
* │ cow      │
* └──────────┘
* ```
*/
padStart(length: number, fillChar: string): Series
/**
* Add a leading '0' to a string until string length is reached.
* If string is longer or equal to given length no modifications will be done
* @param {number} length - of the final string
* @example
* ```
* > df = pl.DataFrame({
* ... 'foo': [
* ... "a",
* ... "b",
* ... "LONG_WORD",
* ... "cow"
* ... ]})
* > df.select(pl.col('foo').str.zFill(3))
* shape: (4, 1)
* ┌──────────┐
* │ foo      │
* │ -------- │
* │ str      │
* ╞══════════╡
* │ 00a      │
* ├╌╌╌╌╌╌╌╌╌╌┤
* │ 00b      │
* ├╌╌╌╌╌╌╌╌╌╌┤
* │ LONG_WORD│
* ├╌╌╌╌╌╌╌╌╌╌┤
* │ cow      │
* └──────────┘
* ```
*/
zFill(length: number): Series
/**
* Add a trailing fillChar to a string until string length is reached.
* If string is longer or equal to given length no modifications will be done
* @param {number} length - of the final string
* @param {string} fillChar - that will fill the string.
* @example
* ```
* > df = pl.DataFrame({ 'foo': ["a", "b", "cow", "longer"] })
* > df.getColumn("foo").str.padEnd(3, "_")
* // values: "a__", "b__", "cow", "longer"
* ```
*/
padEnd(length: number, fillChar: string): Series
/**
* Replace first regex match with a string value.
* @param pattern A valid regex pattern
Expand Down Expand Up @@ -236,6 +302,15 @@ export const StringFunctions = (_s: any): StringFunctions => {
lstrip() {
return wrap("strReplace", /^\s*/.source, "");
},
// Series-side padStart: forwards `length` and `fillChar` unchanged to the
// "strPadStart" operation through `wrap`.
padStart(length: number, fillChar: string) {
return wrap("strPadStart", length, fillChar);
},
// Series-side zFill: forwards `length` unchanged to the "strZFill" operation
// through `wrap`.
zFill(length: number) {
return wrap("strZFill", length);
},
// Series-side padEnd: forwards `length` and `fillChar` unchanged to the
// "strPadEnd" operation through `wrap`.
padEnd(length: number, fillChar: string) {
return wrap("strPadEnd", length, fillChar);
},
replace(pat: RegExp, val: string) {
return wrap("strReplace", regexToString(pat), val);
},
Expand Down
Loading

0 comments on commit ca8b7b8

Please sign in to comment.