diff --git a/data-science/model/recsys.ipynb b/data-science/model/recsys.ipynb new file mode 100644 index 0000000..577f67c --- /dev/null +++ b/data-science/model/recsys.ipynb @@ -0,0 +1,600 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "382f9667-f39c-4766-9f56-ddf936941cec", + "metadata": {}, + "outputs": [], + "source": [ + "# import the required libraries\n", + "import pandas as pd\n", + "from scipy.sparse import csr_matrix\n", + "from implicit.cpu.als import AlternatingLeastSquares\n", + "from implicit.evaluation import mean_average_precision_at_k" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "474de742-7219-4777-9afa-8ae2b268aa45", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('interactions.csv', parse_dates=['last_watch_dt'])  # load the interactions file; last_watch_dt is parsed as a datetime column" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c712c399-8c1f-4394-a03a-97a19cbacd2b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_iditem_idlast_watch_dttotal_durwatched_pct
017654995062021-05-11425072.0
169931716592021-05-298317100.0
265668371072021-05-09100.0
386461376382021-07-0514483100.0
496486895062021-04-306725100.0
..................
5476246648596122252021-08-13760.0
547624754686296732021-04-13230849.0
5476248697262152972021-08-201830763.0
5476249384202161972021-04-196203100.0
547625031970944362021-08-15392145.0
\n", + "

5476251 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " user_id item_id last_watch_dt total_dur watched_pct\n", + "0 176549 9506 2021-05-11 4250 72.0\n", + "1 699317 1659 2021-05-29 8317 100.0\n", + "2 656683 7107 2021-05-09 10 0.0\n", + "3 864613 7638 2021-07-05 14483 100.0\n", + "4 964868 9506 2021-04-30 6725 100.0\n", + "... ... ... ... ... ...\n", + "5476246 648596 12225 2021-08-13 76 0.0\n", + "5476247 546862 9673 2021-04-13 2308 49.0\n", + "5476248 697262 15297 2021-08-20 18307 63.0\n", + "5476249 384202 16197 2021-04-19 6203 100.0\n", + "5476250 319709 4436 2021-08-15 3921 45.0\n", + "\n", + "[5476251 rows x 5 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "5af68c1f-a8be-46c2-ab70-4cbec16284a5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 5476251 entries, 0 to 5476250\n", + "Data columns (total 5 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 user_id 5476251 non-null int64 \n", + " 1 item_id 5476251 non-null int64 \n", + " 2 last_watch_dt 5476251 non-null datetime64[ns]\n", + " 3 total_dur 5476251 non-null int64 \n", + " 4 watched_pct 5475423 non-null float64 \n", + "dtypes: datetime64[ns](1), float64(1), int64(3)\n", + "memory usage: 208.9 MB\n" + ] + } + ], + "source": [ + "df.info(show_counts=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2271d643-6004-4b77-814e-6fe0f16e4e69", + "metadata": {}, + "outputs": [], + "source": [ + "df['watched_pct'] = df['watched_pct'].fillna(0) # fill missing watched_pct values with zeros (df.info above reports 5475423 non-null out of 5476251 rows)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "510bf1f1-28b4-442e-8dbe-f714ec2ea43f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(Timestamp('2021-03-13 00:00:00'), Timestamp('2021-08-22 00:00:00'))" + ] + }, + "execution_count": 
6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['last_watch_dt'].min(), df['last_watch_dt'].max()  # earliest and latest watch date in the data" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "cb768800-08a8-4e77-816f-ca6f7b719f0d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_iditem_idlast_watch_dttotal_durwatched_pct
017654995062021-05-11425072.0
169931716592021-05-298317100.0
265668371072021-05-09100.0
386461376382021-07-0514483100.0
496486895062021-04-306725100.0
..................
547624578673248802021-05-127530.0
5476246648596122252021-08-13760.0
547624754686296732021-04-13230849.0
5476249384202161972021-04-196203100.0
547625031970944362021-08-15392145.0
\n", + "

5051815 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " user_id item_id last_watch_dt total_dur watched_pct\n", + "0 176549 9506 2021-05-11 4250 72.0\n", + "1 699317 1659 2021-05-29 8317 100.0\n", + "2 656683 7107 2021-05-09 10 0.0\n", + "3 864613 7638 2021-07-05 14483 100.0\n", + "4 964868 9506 2021-04-30 6725 100.0\n", + "... ... ... ... ... ...\n", + "5476245 786732 4880 2021-05-12 753 0.0\n", + "5476246 648596 12225 2021-08-13 76 0.0\n", + "5476247 546862 9673 2021-04-13 2308 49.0\n", + "5476249 384202 16197 2021-04-19 6203 100.0\n", + "5476250 319709 4436 2021-08-15 3921 45.0\n", + "\n", + "[5051815 rows x 5 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#разделим данные на обучающую и тестовую выборку - для теста возьмем последние 7 дней\n", + "test = df[df['last_watch_dt'] > '2021-08-15']\n", + "train = df[df['last_watch_dt'] <= '2021-08-15']\n", + "train" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "893fd034-c28a-47e8-a8d7-38a27f17991c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "user_id 906071\n", + "item_id 15577\n", + "last_watch_dt 156\n", + "total_dur 126663\n", + "watched_pct 101\n", + "dtype: int64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "0be849ba-66fb-4a28-ab63-81fb740c38e9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "user_id 167348\n", + "item_id 7106\n", + "last_watch_dt 7\n", + "total_dur 38328\n", + "watched_pct 101\n", + "dtype: int64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a9253f90-8245-449c-a119-d68faba82874", + "metadata": {}, + "outputs": [], + "source": [ + "#предположим, что процент времени просмотра 
линейно связан с вероятностью того, что контент понравился\n", + "\n", + "train_df = train[['user_id', 'item_id', 'watched_pct']].copy()\n", + "test_df = test[['user_id', 'item_id', 'watched_pct']].copy()\n", + "\n", + "train_df['watched_pct'] = train_df['watched_pct'] / 100\n", + "test_df['watched_pct'] = test_df['watched_pct'] / 100\n", + "\n", + "#создадим разреженные матрицы взаимодействий users-items для обучающей и тестовой выборок\n", + "\n", + "userid = list(train_df['user_id'].unique())\n", + "itemid = list(train_df['item_id'].unique())\n", + "data = train_df['watched_pct'].tolist()\n", + "row = pd.Categorical(train_df['user_id'], categories=userid, ordered=True).codes\n", + "col = pd.Categorical(train_df['item_id'], categories=itemid, ordered=True).codes\n", + "train_matrix = csr_matrix((data, (row, col)), shape=(len(userid), len(itemid)))\n", + "\n", + "userid = list(test_df['user_id'].unique())\n", + "itemid = list(test_df['item_id'].unique())\n", + "data = test_df['watched_pct'].tolist()\n", + "row = pd.Categorical(test_df['user_id'], categories=userid, ordered=True).codes\n", + "col = pd.Categorical(test_df['item_id'], categories=itemid, ordered=True).codes\n", + "test_matrix = csr_matrix((data, (row, col)), shape=(len(userid), len(itemid)))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "289cbc71-933d-400f-88c0-43144cf15977", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(<906071x15577 sparse matrix of type ''\n", + " \twith 5051815 stored elements in Compressed Sparse Row format>,\n", + " <167348x7106 sparse matrix of type ''\n", + " \twith 424436 stored elements in Compressed Sparse Row format>)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_matrix, test_matrix" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "54cd1509-5a95-4ce1-9048-c0a9f2fe253a", + "metadata": {}, + "outputs": [ + { + "data": { + 
"application/vnd.jupyter.widget-view+json": { + "model_id": "9e3828e203024fd585b547eab0c55daa", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/15 [00:00