diff --git a/transforms/README-list.md b/transforms/README-list.md index beeefd8d3..7e4d70ee2 100644 --- a/transforms/README-list.md +++ b/transforms/README-list.md @@ -43,6 +43,7 @@ Note: This list includes the transforms that were part of the release starting w ### 1.0.0.a5 Added Pii Redactor + Relax fasttext requirement >= 0.9.2 ### 1.0.0.a4 Added missing ray implementation for lang_id, doc_quality, tokenization and filter Added ray notebooks for lang id, Doc Quality, tokenization, and Filter diff --git a/transforms/language/lang_id/lang_id.ipynb b/transforms/language/lang_id/lang_id.ipynb index 4d27f24b1..5991e5891 100644 --- a/transforms/language/lang_id/lang_id.ipynb +++ b/transforms/language/lang_id/lang_id.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", "metadata": {}, "outputs": [], @@ -24,7 +24,8 @@ "## This is here as a reference only\n", "# Users and application developers must use the right tag for the latest from pypi\n", "%pip install data-prep-toolkit\n", - "%pip install 'data-prep-toolkit-transforms[lang_id]'" + "%pip install 'data-prep-toolkit-transforms[lang_id]'\n", + "%pip install pandas" ] }, { @@ -55,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "9669273a-8fcc-4b40-9b20-8df658e2ab58", "metadata": {}, "outputs": [], @@ -73,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "badafb96-64d2-4bb8-9f3e-b23713fd5c3f", "metadata": {}, "outputs": [ @@ -81,21 +82,20 @@ "name": "stderr", "output_type": "stream", "text": [ - "00:06:41 INFO - lang_id parameters are : {'model_credential': 'PUT YOUR OWN HUGGINGFACE CREDENTIAL', 'model_kind': 'fasttext', 'model_url': 'facebook/fasttext-language-identification', 'content_column_name': 'text', 'output_lang_column_name': 'lang', 'output_score_column_name': 'score'}\n", - "00:06:41 INFO - pipeline id pipeline_id\n", - "00:06:41 INFO - code location None\n", - "00:06:41 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n", - "00:06:41 INFO - data factory data_ max_files -1, n_sample -1\n", - "00:06:41 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "00:06:41 INFO - orchestrator lang_id started at 2024-12-11 00:06:41\n", - "00:06:41 INFO - Number of files is 3, source profile {'max_file_size': 0.3023223876953125, 'min_file_size': 0.037346839904785156, 'total_file_size': 0.4433746337890625}\n", - "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n", - "00:06:47 INFO - Completed 1 files (33.33%) in 0.074 min\n", - "00:06:47 INFO - Completed 2 files (66.67%) in 0.076 min\n", - "00:06:48 INFO - Completed 3 files (100.0%) in 0.081 min\n", - "00:06:48 INFO - Done processing 3 files, waiting for flush() completion.\n", - "00:06:48 INFO - done flushing in 0.0 sec\n", - "00:06:48 INFO - Completed execution in 0.111 min, execution result 0\n" + "10:01:42 INFO - lang_id parameters are : {'model_credential': 'PUT YOUR OWN HUGGINGFACE CREDENTIAL', 'model_kind': 'fasttext', 'model_url': 'facebook/fasttext-language-identification', 'content_column_name': 'text', 'output_lang_column_name': 'lang', 'output_score_column_name': 'score'}\n", + "10:01:42 INFO - pipeline id pipeline_id\n", + "10:01:42 INFO - code location None\n", + "10:01:42 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n", + "10:01:42 INFO - data factory data_ max_files -1, n_sample -1\n", + "10:01:42 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "10:01:42 INFO - orchestrator lang_id started at 2025-01-17 10:01:42\n", + "10:01:42 INFO - Number of files is 3, source profile {'max_file_size': 0.3023223876953125, 'min_file_size': 0.037346839904785156, 'total_file_size': 0.4433746337890625}\n", + "10:01:43 INFO - Completed 1 files (33.33%) in 0.009 min\n", + "10:01:44 INFO - Completed 2 files (66.67%) in 0.011 min\n", + "10:01:44 INFO - Completed 3 files (100.0%) in 0.013 min\n", + "10:01:44 INFO - Done processing 3 files, waiting for flush() completion.\n", + "10:01:44 INFO - done flushing in 0.0 sec\n", + "10:01:44 INFO - Completed execution in 0.024 min, execution result 0\n" ] }, { @@ -104,7 +104,7 @@ "0" ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -128,7 +128,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "7276fe84-6512-4605-ab65-747351e13a7c", "metadata": {}, "outputs": [ @@ -141,7 +141,7 @@ " 'output/test_01.parquet']" ] }, - "execution_count": 3, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -153,9 +153,165 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8", "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | text | \n", + "count() | \n", + "lang | \n", + "score | \n", + "
---|---|---|---|---|
0 | \n", + "- Notice of name-email change.doc | \n", + "6 | \n", + "en | \n", + "0.858 | \n", + "
1 | \n", + "- Nov13ENAOnly.doc | \n", + "2 | \n", + "de | \n", + "0.264 | \n", + "
2 | \n", + "- OHIO_C~1.XLS | \n", + "2 | \n", + "de | \n", + "0.603 | \n", + "
3 | \n", + "- Oneok(5-30)final.doc | \n", + "1 | \n", + "vi | \n", + "0.152 | \n", + "
4 | \n", + "- OpeningBrief.doc | \n", + "6 | \n", + "ko-Hang | \n", + "0.365 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
195 | \n", + "- invite.doc | \n", + "2 | \n", + "ro | \n", + "0.717 | \n", + "
196 | \n", + "- issues wrt portland and calgary signing shor... | \n", + "2 | \n", + "en | \n", + "0.997 | \n", + "
197 | \n", + "- jan3102.XLS | \n", + "2 | \n", + "de | \n", + "0.399 | \n", + "
198 | \n", + "- job market.gif | \n", + "2 | \n", + "en | \n", + "0.791 | \n", + "
199 | \n", + "- kick~1.mpe | \n", + "4 | \n", + "eo | \n", + "0.253 | \n", + "
200 rows × 4 columns
\n", + "