Feature/add str padding methods (pola-rs#41)

* Added option interface to io functions. * Made interfaces PascalCase * Fixed Errors from tests Two tests failing - don't know why * Added padStart, padEnd and justify to lazy str. * Edited jsdoc * Added example * Edit on example * Reformat arguments Reformat justify to zfill * Corrected test to reformat * Added padStart, padEnd and zFill to series * Removed dtype-struct from features * Removed todo * Fixed error * Deleted this * Code formatting
Bidek56 · Jan 29, 2023 · ca8b7b8 · ca8b7b8
1 parent 1a7cef0
commit ca8b7b8
Show file tree

Hide file tree

Showing 7 changed files with 345 additions and 10 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -66,6 +66,7 @@ features = [
   "arange",
   "true_div",
   "dtype-categorical",
+  "string_justify",
   "diagonal_concat",
   "horizontal_concat",
   "abs",

diff --git a/__tests__/expr.test.ts b/__tests__/expr.test.ts
@@ -1190,6 +1190,54 @@ describe("expr.str", () => {
     expect(actual).toFrameEqual(expected);
     expect(seriesActual).toFrameEqual(expected);
   });
+  test("padStart", () => {
+    const df = pl.DataFrame({
+      foo: ["a", "b", "cow", "longer"],
+    });
+    const expected = pl.DataFrame({
+      foo: ["__a", "__b", "cow", "longer"],
+    });
+    const seriesActual = df
+      .getColumn("foo")
+      .str.padStart(3, "_")
+      .rename("foo")
+      .toFrame();
+    const actual = df.select(col("foo").str.padStart(3, "_").as("foo"));
+    expect(actual).toFrameEqual(expected);
+    expect(seriesActual).toFrameEqual(expected);
+  });
+  test("padEnd", () => {
+    const df = pl.DataFrame({
+      foo: ["a", "b", "cow", "longer"],
+    });
+    const expected = pl.DataFrame({
+      foo: ["a__", "b__", "cow", "longer"],
+    });
+    const seriesActual = df
+      .getColumn("foo")
+      .str.padEnd(3, "_")
+      .rename("foo")
+      .toFrame();
+    const actual = df.select(col("foo").str.padEnd(3, "_").as("foo"));
+    expect(actual).toFrameEqual(expected);
+    expect(seriesActual).toFrameEqual(expected);
+  });
+  test("zFill", () => {
+    const df = pl.DataFrame({
+      foo: ["a", "b", "cow", "longer"],
+    });
+    const expected = pl.DataFrame({
+      foo: ["00a", "00b", "cow", "longer"],
+    });
+    const seriesActual = df
+      .getColumn("foo")
+      .str.zFill(3)
+      .rename("foo")
+      .toFrame();
+    const actual = df.select(col("foo").str.zFill(3).as("foo"));
+    expect(actual).toFrameEqual(expected);
+    expect(seriesActual).toFrameEqual(expected);
+  });
   test("hex encode", () => {
     const df = pl.DataFrame({
       original: ["foo", "bar", null],

diff --git a/__tests__/lazyframe.test.ts b/__tests__/lazyframe.test.ts
@@ -1042,4 +1042,43 @@ describe("lazyframe", () => {
     });
     expect(actual).toFrameEqual(expected);
   });
+  test("str:padStart", () => {
+    const actual = pl.DataFrame({
+      "ham": ["a", "b", "c"]
+    }).lazy()
+      .withColumn(
+        pl.col("ham").str.padStart(3, "-")
+      )
+      .collectSync();
+    const expected = pl.DataFrame({
+      "ham": ["--a", "--b", "--c"]
+    });
+    expect(actual).toFrameEqual(expected);
+  });
+  test("str:padEnd", () => {
+    const actual = pl.DataFrame({
+      "ham": ["a", "b", "c"]
+    }).lazy()
+      .withColumn(
+        pl.col("ham").str.padEnd(3, "-")
+      )
+      .collectSync();
+    const expected = pl.DataFrame({
+      "ham": ["a--", "b--", "c--"]
+    });
+    expect(actual).toFrameEqual(expected);
+  });
+  test("str:zFill", () => {
+    const actual = pl.DataFrame({
+      "ham": ["a", "b", "c"]
+    }).lazy()
+      .withColumn(
+        pl.col("ham").str.zFill(3)
+      )
+      .collectSync();
+    const expected = pl.DataFrame({
+      "ham": ["00a", "00b", "00c"]
+    });
+    expect(actual).toFrameEqual(expected);
+  });
 });
diff --git a/polars/lazy/expr/string.ts b/polars/lazy/expr/string.ts
@@ -51,8 +51,8 @@ export interface ExprString {
    * └─────────┘
    * ```
    */
-  decode(encoding: "hex" | "base64", strict?: boolean): Expr
-  decode(options: {encoding: "hex" | "base64", strict?: boolean}): Expr
+  decode(encoding: "hex" | "base64", strict?: boolean): Expr;
+  decode(options: { encoding: "hex" | "base64"; strict?: boolean }): Expr;
   /**
    * Encodes a value using the provided encoding
    * @param encoding - hex | base64
@@ -74,7 +74,7 @@ export interface ExprString {
    * └─────────┘
    * ```
    */
-  encode(encoding: "hex" | "base64"): Expr
+  encode(encoding: "hex" | "base64"): Expr;
   /**
    * Extract the target capture group from provided patterns.
    * @param pattern A valid regex pattern
@@ -140,7 +140,7 @@ export interface ExprString {
   /**  Get length of the string values in the Series. */
   lengths(): Expr;
   /** Remove leading whitespace. */
-  lstrip(): Expr
+  lstrip(): Expr;
   /** Replace first regex match with a string value. */
   replace(pat: string | RegExp, val: string): Expr;
   /** Replace all regex matches with a string value. */
@@ -150,7 +150,105 @@ export interface ExprString {
   /** Modify the strings to their uppercase equivalent. */
   toUpperCase(): Expr;
   /** Remove trailing whitespace. */
-  rstrip(): Expr
+  rstrip(): Expr;
+  /**
+   *  Add a leading fillChar to a string until string length is reached.
+   * If string is longer or equal to given length no modifications will be done
+   * @param {number} length  - of the final string
+   * @param {string} fillChar  - that will fill the string.
+   * @note If a string longer than 1 character is provided only the first character will be used
+   * @example
+   * ```
+   * > df = pl.DataFrame({
+   * ...   'foo': [
+   * ...       "a",
+   * ...       "b",
+   * ...       "LONG_WORD",
+   * ...       "cow"
+   * ...   ]})
+   * > df.select(pl.col('foo').str.padStart(3, "_"))
+   * shape: (4, 1)
+   * ┌──────────┐
+   * │ foo      │
+   * │ -------- │
+   * │ str      │
+   * ╞══════════╡
+   * │ __a      │
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ __b      │
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ LONG_WORD│
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ cow      │
+   * └──────────┘
+   * ```
+   */
+  padStart(length: number, fillChar: string): Expr;
+  /**
+   * Add leading "0" characters to a string until the given length is reached.
+   * If string is longer or equal to given length no modifications will be done
+   * @param {number} length  - of the final string
+   * @see {@link padStart}
+   * @example
+   * ```
+   * > df = pl.DataFrame({
+   * ...   'foo': [
+   * ...       "a",
+   * ...       "b",
+   * ...       "LONG_WORD",
+   * ...       "cow"
+   * ...   ]})
+   * > df.select(pl.col('foo').str.zFill(3))
+   * shape: (4, 1)
+   * ┌──────────┐
+   * │ foo      │
+   * │ -------- │
+   * │ str      │
+   * ╞══════════╡
+   * │ 00a      │
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ 00b      │
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ LONG_WORD│
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ cow      │
+   * └──────────┘
+   * ```
+   */
+  zFill(length: number): Expr;
+  /**
+   *  Add a trailing fillChar to a string until string length is reached.
+   * If string is longer or equal to given length no modifications will be done
+   * @param {number} length  - of the final string
+   * @param {string} fillChar  - that will fill the string.
+   * @note If a string longer than 1 character is provided only the first character will be used
+   * @example
+   * ```
+   * > df = pl.DataFrame({
+   * ...   'foo': [
+   * ...       "a",
+   * ...       "b",
+   * ...       "LONG_WORD",
+   * ...       "cow"
+   * ...   ]})
+   * > df.select(pl.col('foo').str.padEnd(3, "_"))
+   * shape: (4, 1)
+   * ┌──────────┐
+   * │ foo      │
+   * │ -------- │
+   * │ str      │
+   * ╞══════════╡
+   * │ a__      │
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ b__      │
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ LONG_WORD│
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ cow      │
+   * └──────────┘
+   * ```
+   */
+  padEnd(length: number, fillChar: string): Expr;
   /**
    * Create subslices of the string values of a Utf8 Series.
    * @param start - Start of the slice (negative indexing may be used).
@@ -162,16 +260,16 @@ export interface ExprString {
    * @param separator — A string that identifies character or characters to use in separating the string.
    * @param inclusive Include the split character/string in the results
    */
-  split(by: string, options?: {inclusive?: boolean} | boolean): Expr
+  split(by: string, options?: { inclusive?: boolean } | boolean): Expr;
   /** Remove leading and trailing whitespace. */
-  strip(): Expr
+  strip(): Expr;
   /**
    * Parse a Series of dtype Utf8 to a Date/Datetime Series.
    * @param datatype Date or Datetime.
    * @param fmt formatting syntax. [Read more](https://docs.rs/chrono/0.4.19/chrono/format/strftime/index.html)
    */
-  strptime(datatype: DataType.Date, fmt?: string): Expr
-  strptime(datatype: DataType.Datetime, fmt?: string): Expr
+  strptime(datatype: DataType.Date, fmt?: string): Expr;
+  strptime(datatype: DataType.Datetime, fmt?: string): Expr;
 }
 
 export const ExprStringFunctions = (_expr: any): ExprString => {
@@ -235,6 +333,15 @@ export const ExprStringFunctions = (_expr: any): ExprString => {
     rstrip() {
       return wrap("strRstrip");
     },
+    padStart(length: number, fillChar: string){
+      return wrap("strPadStart", length, fillChar);
+    },
+    zFill(length: number) {
+      return wrap("strZFill", length);
+    },
+    padEnd(length: number, fillChar: string) {
+      return wrap("strPadEnd", length, fillChar);
+    },
     slice(start: number, length?: number) {
       return wrap("strSlice", start, length);
     },

diff --git a/polars/series/string.ts b/polars/series/string.ts
@@ -128,6 +128,72 @@ export interface StringFunctions {
   lengths(): Series
   /** Remove leading whitespace. */
   lstrip(): Series
+  /**
+   *  Add a leading fillChar to a string until string length is reached.
+   * If string is longer or equal to given length no modifications will be done
+   * @param {number} length  - of the final string
+   * @param {string} fillChar  - that will fill the string.
+   * @note If a string longer than 1 character is provided only the first character will be used
+   * @example
+   * ```
+   * > df = pl.DataFrame({
+   * ...   'foo': [
+   * ...       "a",
+   * ...       "b",
+   * ...       "LONG_WORD",
+   * ...       "cow"
+   * ...   ]})
+   * > df.select(pl.col('foo').str.padStart(3, "_"))
+   * shape: (4, 1)
+   * ┌──────────┐
+   * │ foo      │
+   * │ -------- │
+   * │ str      │
+   * ╞══════════╡
+   * │ __a      │
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ __b      │
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ LONG_WORD│
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ cow      │
+   * └──────────┘
+   * ```
+  */
+  padStart(length: number, fillChar: string): Series
+  /**
+   *  Add a leading '0' to a string until string length is reached.
+   * If string is longer or equal to given length no modifications will be done
+   * @param {number} length  - of the final string
+   * @example
+   * ```
+   * > df = pl.DataFrame({
+   * ...   'foo': [
+   * ...       "a",
+   * ...       "b",
+   * ...       "LONG_WORD",
+   * ...       "cow"
+   * ...   ]})
+   * > df.select(pl.col('foo').str.zFill(3))
+   * shape: (4, 1)
+   * ┌──────────┐
+   * │ foo      │
+   * │ -------- │
+   * │ str      │
+   * ╞══════════╡
+   * │ 00a      │
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ 00b      │
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ LONG_WORD│
+   * ├╌╌╌╌╌╌╌╌╌╌┤
+   * │ cow      │
+   * └──────────┘
+   * ```
+  */
+  zFill(length: number): Series
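+  /** Add a trailing fillChar to a string until the given length is reached; strings already that long or longer are left unchanged. */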
+  padEnd(length: number, fillChar: string): Series
   /**
    * Replace first regex match with a string value.
    * @param pattern A valid regex pattern
@@ -236,6 +302,15 @@ export const StringFunctions = (_s: any): StringFunctions => {
     lstrip() {
       return wrap("strReplace", /^\s*/.source, "");
     },
+    padStart(length: number, fillChar: string) {
+      return wrap("strPadStart", length, fillChar);
+    },
+    zFill(length: number) {
+      return wrap("strZFill", length);
+    },
+    padEnd(length: number, fillChar: string) {
+      return wrap("strPadEnd", length, fillChar);
+    },
     replace(pat: RegExp, val: string) {
       return wrap("strReplace", regexToString(pat), val);
     },