diff --git a/transforms/README-list.md b/transforms/README-list.md index beeefd8d3..7e4d70ee2 100644 --- a/transforms/README-list.md +++ b/transforms/README-list.md @@ -43,6 +43,7 @@ Note: This list includes the transforms that were part of the release starting w ### 1.0.0.a5 Added Pii Redactor + Relax fasttext requirement >= 0.9.2 ### 1.0.0.a4 Added missing ray implementation for lang_id, doc_quality, tokenization and filter Added ray notebooks for lang id, Doc Quality, tokenization, and Filter diff --git a/transforms/language/lang_id/lang_id.ipynb b/transforms/language/lang_id/lang_id.ipynb index 4d27f24b1..5991e5891 100644 --- a/transforms/language/lang_id/lang_id.ipynb +++ b/transforms/language/lang_id/lang_id.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", "metadata": {}, "outputs": [], @@ -24,7 +24,8 @@ "## This is here as a reference only\n", "# Users and application developers must use the right tag for the latest from pypi\n", "%pip install data-prep-toolkit\n", - "%pip install 'data-prep-toolkit-transforms[lang_id]'" + "%pip install 'data-prep-toolkit-transforms[lang_id]'\n", + "%pip install pandas" ] }, { @@ -55,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "9669273a-8fcc-4b40-9b20-8df658e2ab58", "metadata": {}, "outputs": [], @@ -73,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "badafb96-64d2-4bb8-9f3e-b23713fd5c3f", "metadata": {}, "outputs": [ @@ -81,21 +82,20 @@ "name": "stderr", "output_type": "stream", "text": [ - "00:06:41 INFO - lang_id parameters are : {'model_credential': 'PUT YOUR OWN HUGGINGFACE CREDENTIAL', 'model_kind': 'fasttext', 'model_url': 'facebook/fasttext-language-identification', 'content_column_name': 'text', 'output_lang_column_name': 'lang', 'output_score_column_name': 'score'}\n", - "00:06:41 INFO - pipeline id pipeline_id\n", - "00:06:41 INFO - code location None\n", - "00:06:41 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n", - "00:06:41 INFO - data factory data_ max_files -1, n_sample -1\n", - "00:06:41 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "00:06:41 INFO - orchestrator lang_id started at 2024-12-11 00:06:41\n", - "00:06:41 INFO - Number of files is 3, source profile {'max_file_size': 0.3023223876953125, 'min_file_size': 0.037346839904785156, 'total_file_size': 0.4433746337890625}\n", - "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n", - "00:06:47 INFO - Completed 1 files (33.33%) in 0.074 min\n", - "00:06:47 INFO - Completed 2 files (66.67%) in 0.076 min\n", - "00:06:48 INFO - Completed 3 files (100.0%) in 0.081 min\n", - "00:06:48 INFO - Done processing 3 files, waiting for flush() completion.\n", - "00:06:48 INFO - done flushing in 0.0 sec\n", - "00:06:48 INFO - Completed execution in 0.111 min, execution result 0\n" + "10:01:42 INFO - lang_id parameters are : {'model_credential': 'PUT YOUR OWN HUGGINGFACE CREDENTIAL', 'model_kind': 'fasttext', 'model_url': 'facebook/fasttext-language-identification', 'content_column_name': 'text', 'output_lang_column_name': 'lang', 'output_score_column_name': 'score'}\n", + "10:01:42 INFO - pipeline id pipeline_id\n", + "10:01:42 INFO - code location None\n", + "10:01:42 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n", + "10:01:42 INFO - data factory data_ max_files -1, n_sample -1\n", + "10:01:42 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "10:01:42 INFO - orchestrator lang_id started at 2025-01-17 10:01:42\n", + "10:01:42 INFO - Number of files is 3, source profile {'max_file_size': 0.3023223876953125, 'min_file_size': 0.037346839904785156, 'total_file_size': 0.4433746337890625}\n", + "10:01:43 INFO - Completed 1 files (33.33%) in 0.009 min\n", + "10:01:44 INFO - Completed 2 files (66.67%) in 0.011 min\n", + "10:01:44 INFO - Completed 3 files (100.0%) in 0.013 min\n", + "10:01:44 INFO - Done processing 3 files, waiting for flush() completion.\n", + "10:01:44 INFO - done flushing in 0.0 sec\n", + "10:01:44 INFO - Completed execution in 0.024 min, execution result 0\n" ] }, { @@ -104,7 +104,7 @@ "0" ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -128,7 +128,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "7276fe84-6512-4605-ab65-747351e13a7c", "metadata": {}, "outputs": [ @@ -141,7 +141,7 @@ " 'output/test_01.parquet']" ] }, - "execution_count": 3, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -153,9 +153,165 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8", "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textcount()langscore
0- Notice of name-email change.doc6en0.858
1- Nov13ENAOnly.doc2de0.264
2- OHIO_C~1.XLS2de0.603
3- Oneok(5-30)final.doc1vi0.152
4- OpeningBrief.doc6ko-Hang0.365
...............
195- invite.doc2ro0.717
196- issues wrt portland and calgary signing shor...2en0.997
197- jan3102.XLS2de0.399
198- job market.gif2en0.791
199- kick~1.mpe4eo0.253
\n", + "

200 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " text count() lang \\\n", + "0 - Notice of name-email change.doc 6 en \n", + "1 - Nov13ENAOnly.doc 2 de \n", + "2 - OHIO_C~1.XLS 2 de \n", + "3 - Oneok(5-30)final.doc 1 vi \n", + "4 - OpeningBrief.doc 6 ko-Hang \n", + ".. ... ... ... \n", + "195 - invite.doc 2 ro \n", + "196 - issues wrt portland and calgary signing shor... 2 en \n", + "197 - jan3102.XLS 2 de \n", + "198 - job market.gif 2 en \n", + "199 - kick~1.mpe 4 eo \n", + "\n", + " score \n", + "0 0.858 \n", + "1 0.264 \n", + "2 0.603 \n", + "3 0.152 \n", + "4 0.365 \n", + ".. ... \n", + "195 0.717 \n", + "196 0.997 \n", + "197 0.399 \n", + "198 0.791 \n", + "199 0.253 \n", + "\n", + "[200 rows x 4 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "pd.read_parquet('output/test_01.parquet', engine='pyarrow')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7aef6ac9-96cf-40ad-a472-b5d9036436e5", + "metadata": {}, "outputs": [], "source": [] } diff --git a/transforms/language/lang_id/requirements.txt b/transforms/language/lang_id/requirements.txt index df0ffd37e..7f9d2f5d3 100644 --- a/transforms/language/lang_id/requirements.txt +++ b/transforms/language/lang_id/requirements.txt @@ -1,4 +1,4 @@ -fasttext==0.9.2 ; platform_system != "Windows" +fasttext>=0.9.2 ; platform_system != "Windows" langcodes>=3.3.0 huggingface-hub >= 0.21.4, <1.0.0 numpy==1.26.4