From 57e33008c373ed45c6e54c703da005fb474e9b3f Mon Sep 17 00:00:00 2001 From: Jacob Celestine Date: Mon, 27 Jan 2020 14:04:02 +0530 Subject: [PATCH] Added colab and pyspark notebook --- pyspark/Colab and PySpark.ipynb | 2822 +++++++++++++++++++++++++++++++ 1 file changed, 2822 insertions(+) create mode 100644 pyspark/Colab and PySpark.ipynb diff --git a/pyspark/Colab and PySpark.ipynb b/pyspark/Colab and PySpark.ipynb new file mode 100644 index 0000000..650d58b --- /dev/null +++ b/pyspark/Colab and PySpark.ipynb @@ -0,0 +1,2822 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This tutorial was made using Google Colab so the code you see here is meant to run on a colab notebook.
\n", + "It goes through basic [PySpark Functions](https://spark.apache.org/docs/latest/api/python/index.html) and a short introduction on how to use [Colab](https://colab.research.google.com/notebooks/basic_features_overview.ipynb).
\n", + "I used Colab because of its shareability and its free GPU. Yes, you read that right: a FREE GPU! In the words of Google:
\n", + "`Colaboratory, or “Colab” for short, is a product from Google Research. Colab allows anybody to write and execute arbitrary python code through the browser, and is especially well suited to machine learning, data analysis and education. More technically, Colab is a hosted Jupyter notebook service that requires no setup to use, while providing free access to computing resources including GPUs.`
\n", + "If you have more questions about Colab, [refer to this link](https://research.google.com/colaboratory/faq.html).
\n", + "\n", + "All you need is an internet connection to keep a session alive. If you lose the connection you will have to download the datasets again.
\n", + "If you want to view my Colab notebook, you can do so [here](https://colab.research.google.com/drive/1G894WS7ltIUTusWWmsCnF_zQhQqZCDOc). The viewing experience and readability are much better there.
\n", + "If you want to try out things with this notebook as a base, feel free to download it from my repo [here](https://github.com/jacobceles/jacobceles.github.io) and then use it with jupyter notebook.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "qJoeN3e8_Gzk" + }, + "source": [ + "# Introduction to Google Colab and PySpark" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "_N5-lspH_N8B" + }, + "source": [ + "## Jupyter notebook basics" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "6Ul54hAYyHyd" + }, + "source": [ + "### Code cells" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "id": "j38beRUTCI5c", + "outputId": "0aba3c8e-8741-411e-c771-9f8440f7b77f" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "6" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "2*3" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "_Jewe_e9CIYa" + }, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "id": "g8Y7w6_CCIIT", + "outputId": "5b147862-bdac-4388-b08f-fdf353570d09" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hello!\n" + ] + } + ], + "source": [ + "print(\"Hello!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "VOqLNkRKyUIS" + }, + "source": [ + "### Text cells" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "X6zdrH15_CCW" + }, + "source": [ + "## Access to the shell" + ] + }, + { + 
"cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 85 + }, + "colab_type": "code", + "id": "zdO9sjSdEVnr", + "outputId": "84681aa2-a37c-447c-a082-a55071e80796" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34mBackup DAGs\u001b[m\u001b[m/\r\n", + "Colab_and_PySpark.ipynb\r\n", + "\u001b[34mReference\u001b[m\u001b[m/\r\n", + "~$ctory Audit_AS-IS Process_03142019.docx\r\n" + ] + } + ], + "source": [ + "ls" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "id": "QF9e3lDDEX3I", + "outputId": "63b0236d-8157-4486-a190-26f15a570726" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "u'/Users/jcele1/Downloads'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pwd" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Dd6t0uFzuR4X" + }, + "source": [ + "## Install Spark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 493 + }, + "colab_type": "code", + "id": "tt7ZS1_wGgjn", + "outputId": "aa62b40f-f9bd-4b58-f0ab-7487914ac06e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/bin/sh: apt-get: command not found\n", + "/bin/sh: apt-get: command not found\n" + ] + } + ], + "source": [ + "!apt-get update\n", + "!apt-get install openjdk-8-jdk-headless -qq > /dev/null\n", + "!wget -q http://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz\n", + "!tar xf spark-2.4.4-bin-hadoop2.7.tgz\n", + "!pip install -q findspark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "sdOOq4twHN1K" + }, + "outputs": [], + 
"source": [ + "import os\n", + "os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-8-openjdk-amd64\"\n", + "os.environ[\"SPARK_HOME\"] = \"/content/spark-2.4.4-bin-hadoop2.7\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "id": "3ACYMwhgHTYz", + "outputId": "e7e5ce2b-cec9-4537-c327-007ecd69b9d0" + }, + "outputs": [], + "source": [ + "!ls" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 193 + }, + "colab_type": "code", + "id": "wjfF7LLgHZe3", + "outputId": "043fdfeb-38a1-400f-d8a6-1e371962514a" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "

SparkContext

\n", + "\n", + "

Spark UI

\n", + "\n", + "
\n", + "
Version
\n", + "
v2.4.4
\n", + "
Master
\n", + "
local[*]
\n", + "
AppName
\n", + "
pyspark-shell
\n", + "
\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "import findspark\n", + "findspark.init()\n", + "from pyspark import SparkContext\n", + "\n", + "sc = SparkContext.getOrCreate()\n", + "sc" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 216 + }, + "colab_type": "code", + "id": "Gs7fzvxcHfvw", + "outputId": "757dc23b-d3ba-4b67-ead7-e654ef674db3" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "

SparkSession - in-memory

\n", + " \n", + "
\n", + "

SparkContext

\n", + "\n", + "

Spark UI

\n", + "\n", + "
\n", + "
Version
\n", + "
v2.4.4
\n", + "
Master
\n", + "
local[*]
\n", + "
AppName
\n", + "
pyspark-shell
\n", + "
\n", + "
\n", + " \n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "import pyspark\n", + "from pyspark.sql import SparkSession\n", + "spark = SparkSession.builder.getOrCreate() \n", + "spark" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "hmIqq6xPK7m7" + }, + "source": [ + "# Loading Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + }, + "colab_type": "code", + "id": "hQ3zmGACLKlN", + "outputId": "0948af88-5021-45b3-a966-3081c6c3f022" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2020-01-22 04:20:04-- https://data.cityofchicago.org/api/views/w98m-zvie/rows.csv?accessType=DOWNLOAD\n", + "Resolving data.cityofchicago.org (data.cityofchicago.org)... 52.206.140.199, 52.206.68.26, 52.206.140.205\n", + "Connecting to data.cityofchicago.org (data.cityofchicago.org)|52.206.140.199|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: unspecified [text/csv]\n", + "Saving to: ‘rows.csv?accessType=DOWNLOAD’\n", + "\n", + "rows.csv?accessType [ <=> ] 58.56M 3.32MB/s in 18s \n", + "\n", + "2020-01-22 04:20:23 (3.29 MB/s) - ‘rows.csv?accessType=DOWNLOAD’ saved [61404826]\n", + "\n" + ] + } + ], + "source": [ + "# Downloading and preprocessing Chicago's Reported Crime Data\n", + "!wget https://data.cityofchicago.org/api/views/w98m-zvie/rows.csv?accessType=DOWNLOAD" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 + }, + "colab_type": "code", + "id": "Wpq2jYvIMOJy", + "outputId": "16ee6f3a-5455-483c-dc06-54c19c55d59a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "'rows.csv?accessType=DOWNLOAD'\t spark-2.4.4-bin-hadoop2.7\n", + " sample_data\t\t\t spark-2.4.4-bin-hadoop2.7.tgz\n" + ] + } + ], + "source": [ + "!ls" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "uCJX5cwdMS9q" + }, + "outputs": [], + "source": [ + "#Renaming the downloaded file\n", + "!mv rows.csv?accessType=DOWNLOAD reported-crimes.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + }, + "colab_type": "code", + "id": "hz6ALr5mMqZt", + "outputId": "71469489-37e7-4ef6-a483-2332ef0d3cf8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------+-----------+--------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+\n", + "| ID|Case Number| Date| Block|IUCR| Primary Type| Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y 
Coordinate|Year| Updated On|Latitude|Longitude|Location|\n", + "+--------+-----------+--------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+\n", + "|11955940| JD121140|07/31/2019 09:00:...| 026XX S HALSTED ST|1154| DECEPTIVE PRACTICE|FINANCIAL IDENTIT...| SMALL RETAIL STORE| false| false|0913| 009| 11| 60| 11| null| null|2019|01/20/2020 03:52:...| null| null| null|\n", + "|11956035| JD121288|09/10/2019 12:01:...|070XX S STONY ISL...|1582|OFFENSE INVOLVING...| CHILD PORNOGRAPHY| RESIDENCE| false| false|0332| 003| 5| 43| 17| null| null|2019|01/20/2020 03:52:...| null| null| null|\n", + "|11956045| JD121237|12/15/2019 12:00:...| 069XX S ADA ST|0610| BURGLARY| FORCIBLE ENTRY| RESIDENCE| false| false|0734| 007| 6| 67| 05| null| null|2019|01/20/2020 03:52:...| null| null| null|\n", + "|11956146| JD121344|12/31/2019 12:00:...| 016XX S THROOP ST|0610| BURGLARY| FORCIBLE ENTRY| RESIDENCE| false| false|1233| 012| 25| 31| 05| null| null|2019|01/20/2020 03:52:...| null| null| null|\n", + "|11956304| JD121233|11/27/2019 10:35:...| 002XX W GOETHE ST|0810| THEFT| OVER $500| RESIDENCE| false| false|1821| 018| 2| 8| 06| null| null|2019|01/20/2020 03:52:...| null| null| null|\n", + "+--------+-----------+--------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "df = spark.read.csv('reported-crimes.csv',header=True)\n", + "df.show(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "HgwoX-pfNqQI" + }, + "source": [ + "# Working with the DataFrame API" + ] + }, + { + "cell_type": "markdown", + 
"metadata": { + "colab_type": "text", + "id": "_QwZtWxZRCBn" + }, + "source": [ + "## Viewing Dataframe" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "50LZ3S8_PMg_" + }, + "source": [ + "In Spark, you have a couple of options to view the DataFrame(DF).\n", + "\n", + "\n", + "1. take(3) will return a list of three row objects. \n", + "2. df.collect() will get all of the data from the entire DataFrame . Be careful when using it, because if you have a large data set when you run collect, you can easily crash the driver node. \n", + "3. If you want Spark to print out your DataFrame in a nice format, then try df.show() with the number of rows as paramter. \n", + "\n", + "N.B: The limit function **returns a new DataFrame** by taking the first n rows." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "eFoagdqARKb8" + }, + "source": [ + "## Schema of a DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 391 + }, + "colab_type": "code", + "id": "w6qwTjGsNxrw", + "outputId": "fa18fcce-a7d0-41f3-934e-520d51eaadd1" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('ID', 'string'),\n", + " ('Case Number', 'string'),\n", + " ('Date', 'string'),\n", + " ('Block', 'string'),\n", + " ('IUCR', 'string'),\n", + " ('Primary Type', 'string'),\n", + " ('Description', 'string'),\n", + " ('Location Description', 'string'),\n", + " ('Arrest', 'string'),\n", + " ('Domestic', 'string'),\n", + " ('Beat', 'string'),\n", + " ('District', 'string'),\n", + " ('Ward', 'string'),\n", + " ('Community Area', 'string'),\n", + " ('FBI Code', 'string'),\n", + " ('X Coordinate', 'string'),\n", + " ('Y Coordinate', 'string'),\n", + " ('Year', 'string'),\n", + " ('Updated On', 'string'),\n", + " ('Latitude', 'string'),\n", + " ('Longitude', 'string'),\n", + " ('Location', 'string')]" + ] + }, + "execution_count": 11, 
+ "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 425 + }, + "colab_type": "code", + "id": "CCGTFlCWRPw4", + "outputId": "4d85a813-406b-43bf-fd77-f6df935f3b42" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- ID: string (nullable = true)\n", + " |-- Case Number: string (nullable = true)\n", + " |-- Date: string (nullable = true)\n", + " |-- Block: string (nullable = true)\n", + " |-- IUCR: string (nullable = true)\n", + " |-- Primary Type: string (nullable = true)\n", + " |-- Description: string (nullable = true)\n", + " |-- Location Description: string (nullable = true)\n", + " |-- Arrest: string (nullable = true)\n", + " |-- Domestic: string (nullable = true)\n", + " |-- Beat: string (nullable = true)\n", + " |-- District: string (nullable = true)\n", + " |-- Ward: string (nullable = true)\n", + " |-- Community Area: string (nullable = true)\n", + " |-- FBI Code: string (nullable = true)\n", + " |-- X Coordinate: string (nullable = true)\n", + " |-- Y Coordinate: string (nullable = true)\n", + " |-- Year: string (nullable = true)\n", + " |-- Updated On: string (nullable = true)\n", + " |-- Latitude: string (nullable = true)\n", + " |-- Longitude: string (nullable = true)\n", + " |-- Location: string (nullable = true)\n", + "\n" + ] + } + ], + "source": [ + "df.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 391 + }, + "colab_type": "code", + "id": "xpsaQ4JMRUiS", + "outputId": "f4a5a4c7-b69a-49e3-9609-b8c5720ac2e2" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['ID',\n", + " 'Case Number',\n", + " 'Date',\n", + " 'Block',\n", + " 'IUCR',\n", + " 'Primary Type',\n", + " 'Description',\n", + " 
'Location Description',\n", + " 'Arrest',\n", + " 'Domestic',\n", + " 'Beat',\n", + " 'District',\n", + " 'Ward',\n", + " 'Community Area',\n", + " 'FBI Code',\n", + " 'X Coordinate',\n", + " 'Y Coordinate',\n", + " 'Year',\n", + " 'Updated On',\n", + " 'Latitude',\n", + " 'Longitude',\n", + " 'Location']" + ] + }, + "execution_count": 13, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "# Defining a schema\n", + "from pyspark.sql.types import StructType, StructField, StringType, TimestampType, BooleanType, DoubleType, IntegerType\n", + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ik62VX34SlFh" + }, + "outputs": [], + "source": [ + "labels = [\n", + " ('ID',StringType()),\n", + " ('Case Number',StringType()),\n", + " ('Date',TimestampType()),\n", + " ('Block',StringType()),\n", + " ('IUCR',StringType()),\n", + " ('Primary Type',StringType()),\n", + " ('Description',StringType()),\n", + " ('Location Description',StringType()),\n", + " ('Arrest',StringType()),\n", + " ('Domestic',BooleanType()),\n", + " ('Beat',StringType()),\n", + " ('District',StringType()),\n", + " ('Ward',StringType()),\n", + " ('Community Area',StringType()),\n", + " ('FBI Code',StringType()),\n", + " ('X Coordinate',StringType()),\n", + " ('Y Coordinate',StringType()),\n", + " ('Year',IntegerType()),\n", + " ('Updated On',StringType()),\n", + " ('Latitude',DoubleType()),\n", + " ('Longitude',DoubleType()),\n", + " ('Location',StringType()),\n", + " ('Historical Wards 2003-2015',StringType()),\n", + " ('Zip Codes',StringType()),\n", + " ('Community Areas',StringType()),\n", + " ('Census Tracts',StringType()),\n", + " ('Wards',StringType()),\n", + " ('Boundaries - ZIP Codes',StringType()),\n", + " ('Police Districts',StringType()),\n", + " ('Police Beats',StringType())\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + 
"colab": { + "base_uri": "https://localhost:8080/", + "height": 54 + }, + "colab_type": "code", + "id": "T-Fp5y_oU9SF", + "outputId": "b60ae500-9efa-463a-965f-b1d37bee6b97" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "StructType(List(StructField(ID,StringType,true),StructField(Case Number,StringType,true),StructField(Date,TimestampType,true),StructField(Block,StringType,true),StructField(IUCR,StringType,true),StructField(Primary Type,StringType,true),StructField(Description,StringType,true),StructField(Location Description,StringType,true),StructField(Arrest,StringType,true),StructField(Domestic,BooleanType,true),StructField(Beat,StringType,true),StructField(District,StringType,true),StructField(Ward,StringType,true),StructField(Community Area,StringType,true),StructField(FBI Code,StringType,true),StructField(X Coordinate,StringType,true),StructField(Y Coordinate,StringType,true),StructField(Year,IntegerType,true),StructField(Updated On,StringType,true),StructField(Latitude,DoubleType,true),StructField(Longitude,DoubleType,true),StructField(Location,StringType,true),StructField(Historical Wards 2003-2015,StringType,true),StructField(Zip Codes,StringType,true),StructField(Community Areas,StringType,true),StructField(Census Tracts,StringType,true),StructField(Wards,StringType,true),StructField(Boundaries - ZIP Codes,StringType,true),StructField(Police Districts,StringType,true),StructField(Police Beats,StringType,true)))" + ] + }, + "execution_count": 15, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "schema = StructType([StructField (x[0], x[1], True) for x in labels])\n", + "schema" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 561 + }, + "colab_type": "code", + "id": "sgC7gtL5VTls", + "outputId": "55c6dcad-4127-4a49-b5e8-df884baa1023" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "root\n", + " |-- ID: string (nullable = true)\n", + " |-- Case Number: string (nullable = true)\n", + " |-- Date: timestamp (nullable = true)\n", + " |-- Block: string (nullable = true)\n", + " |-- IUCR: string (nullable = true)\n", + " |-- Primary Type: string (nullable = true)\n", + " |-- Description: string (nullable = true)\n", + " |-- Location Description: string (nullable = true)\n", + " |-- Arrest: string (nullable = true)\n", + " |-- Domestic: boolean (nullable = true)\n", + " |-- Beat: string (nullable = true)\n", + " |-- District: string (nullable = true)\n", + " |-- Ward: string (nullable = true)\n", + " |-- Community Area: string (nullable = true)\n", + " |-- FBI Code: string (nullable = true)\n", + " |-- X Coordinate: string (nullable = true)\n", + " |-- Y Coordinate: string (nullable = true)\n", + " |-- Year: integer (nullable = true)\n", + " |-- Updated On: string (nullable = true)\n", + " |-- Latitude: double (nullable = true)\n", + " |-- Longitude: double (nullable = true)\n", + " |-- Location: string (nullable = true)\n", + " |-- Historical Wards 2003-2015: string (nullable = true)\n", + " |-- Zip Codes: string (nullable = true)\n", + " |-- Community Areas: string (nullable = true)\n", + " |-- Census Tracts: string (nullable = true)\n", + " |-- Wards: string (nullable = true)\n", + " |-- Boundaries - ZIP Codes: string (nullable = true)\n", + " |-- Police Districts: string (nullable = true)\n", + " |-- Police Beats: string (nullable = true)\n", + "\n" + ] + } + ], + "source": [ + "df = spark.read.csv('reported-crimes.csv',schema=schema)\n", + "df.printSchema()\n", + "# The schema comes as we gave!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 479 + }, + "colab_type": "code", + "id": "Dn2EAhesVmx0", + "outputId": "285aaf1c-0946-4c74-d5e1-0e51a701f19e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----+-----------+----+-----+----+------------+-----------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+----------+--------+---------+--------+--------------------------+---------+---------------+-------------+-----+----------------------+----------------+------------+\n", + "| ID|Case Number|Date|Block|IUCR|Primary Type|Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|Updated On|Latitude|Longitude|Location|Historical Wards 2003-2015|Zip Codes|Community Areas|Census Tracts|Wards|Boundaries - ZIP Codes|Police Districts|Police Beats|\n", + "+----+-----------+----+-----+----+------------+-----------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+----------+--------+---------+--------+--------------------------+---------+---------------+-------------+-----+----------------------+----------------+------------+\n", + "|null| null|null| null|null| null| null| null| null| null|null| null|null| null| null| null| null|null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "|null| null|null| null|null| null| null| null| null| null|null| null|null| null| null| null| null|null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "|null| null|null| null|null| null| null| null| null| null|null| null|null| null| null| null| null|null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "|null| null|null| null|null| null| null| null| null| null|null| null|null| null| 
null| null| null|null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "|null| null|null| null|null| null| null| null| null| null|null| null|null| null| null| null| null|null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "|null| null|null| null|null| null| null| null| null| null|null| null|null| null| null| null| null|null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "|null| null|null| null|null| null| null| null| null| null|null| null|null| null| null| null| null|null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "|null| null|null| null|null| null| null| null| null| null|null| null|null| null| null| null| null|null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "|null| null|null| null|null| null| null| null| null| null|null| null|null| null| null| null| null|null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "|null| null|null| null|null| null| null| null| null| null|null| null|null| null| null| null| null|null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "|null| null|null| null|null| null| null| null| null| null|null| null|null| null| null| null| null|null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "|null| null|null| null|null| null| null| null| null| null|null| null|null| null| null| null| null|null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "|null| null|null| null|null| null| null| null| null| null|null| null|null| null| null| null| null|null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "|null| null|null| null|null| null| null| null| null| null|null| null|null| null| null| null| null|null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "|null| null|null| null|null| null| null| null| null| null|null| 
null|null| null| null| null| null|null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "|null| null|null| null|null| null| null| null| null| null|null| null|null| null| null| null| null|null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "|null| null|null| null|null| null| null| null| null| null|null| null|null| null| null| null| null|null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "|null| null|null| null|null| null| null| null| null| null|null| null|null| null| null| null| null|null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "|null| null|null| null|null| null| null| null| null| null|null| null|null| null| null| null| null|null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "|null| null|null| null|null| null| null| null| null| null|null| null|null| null| null| null| null|null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "+----+-----------+----+-----+----+------------+-----------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+----------+--------+---------+--------+--------------------------+---------+---------------+-------------+-----+----------------------+----------------+------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "df.show()\n", + "# This comes as null which means the datatypes we gave were wrong." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + }, + "colab_type": "code", + "id": "BrkVrQUsWFzk", + "outputId": "f0d8dae5-5413-4c4c-a105-b6ca5556667a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------+-----------+-------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+\n", + "| ID|Case Number| Date| Block|IUCR| Primary Type| Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year| Updated On|Latitude|Longitude|Location|\n", + "+--------+-----------+-------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+\n", + "|11955940| JD121140|2019-07-31 09:00:00| 026XX S HALSTED ST|1154| DECEPTIVE PRACTICE|FINANCIAL IDENTIT...| SMALL RETAIL STORE| false| false|0913| 009| 11| 60| 11| null| null|2019|01/20/2020 03:52:...| null| null| null|\n", + "|11956035| JD121288|2019-09-10 00:01:00|070XX S STONY ISL...|1582|OFFENSE INVOLVING...| CHILD PORNOGRAPHY| RESIDENCE| false| false|0332| 003| 5| 43| 17| null| null|2019|01/20/2020 03:52:...| null| null| null|\n", + "|11956045| JD121237|2019-12-15 12:00:00| 069XX S ADA ST|0610| BURGLARY| FORCIBLE ENTRY| RESIDENCE| false| false|0734| 007| 6| 67| 05| null| null|2019|01/20/2020 03:52:...| null| null| null|\n", + "|11956146| JD121344|2019-12-31 00:00:00| 016XX S THROOP ST|0610| BURGLARY| FORCIBLE ENTRY| RESIDENCE| false| false|1233| 012| 25| 31| 05| null| null|2019|01/20/2020 03:52:...| null| null| null|\n", + "|11956304| JD121233|2019-11-27 10:35:00| 
002XX W GOETHE ST|0810| THEFT| OVER $500| RESIDENCE| false| false|1821| 018| 2| 8| 06| null| null|2019|01/20/2020 03:52:...| null| null| null|\n", + "+--------+-----------+-------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "# So let's just stick with the default (all-string) schema for now, parsing the Date column into a timestamp on the way\n", + "from pyspark.sql.functions import col, to_timestamp\n", + "df = spark.read.csv('reported-crimes.csv',header=True).withColumn('Date',to_timestamp(col('Date'),'MM/dd/yyyy hh:mm:ss a'))\n", + "df.show(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "rsD48rckdHPe" + }, + "source": [ + "# Working with columns" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "id": "ge9-_ygideWk", + "outputId": "9a1657dd-c7c6-4275-dffd-be200c7b8d07" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Column" + ] + }, + "execution_count": 20, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "df.Block" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "id": "md5zaET8dsr4", + "outputId": "202211f8-aace-4955-eb5d-1ad90baa35ab" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Column" + ] + }, + "execution_count": 21, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "df['Block']" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "YxP1su8veNde" + }, + "source": [ + "**NOTE:**\n", + "\n", + 
"> **We can't always use the dot notation because this will break when the column names have reserved names or attributes to the data frame class.**\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 459 + }, + "colab_type": "code", + "id": "6Gkf14sHec9a", + "outputId": "ad356e2a-01d8-4562-8de5-14216be40c32" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| Block|\n", + "+--------------------+\n", + "| 026XX S HALSTED ST|\n", + "|070XX S STONY ISL...|\n", + "| 069XX S ADA ST|\n", + "| 016XX S THROOP ST|\n", + "| 002XX W GOETHE ST|\n", + "|083XX S MUSKEGON AVE|\n", + "| 050XX W NELSON ST|\n", + "| 011XX W 83RD ST|\n", + "| 081XX W ADDISON ST|\n", + "| 031XX S PRAIRIE AVE|\n", + "|023XX N MILWAUKEE...|\n", + "| 0000X E CHESTNUT ST|\n", + "| 0000X N LATROBE AVE|\n", + "| 104XX S AVENUE J|\n", + "|025XX S CALIFORNI...|\n", + "| 021XX E 70TH ST|\n", + "|082XX S JEFFERY BLVD|\n", + "| 076XX S CICERO AVE|\n", + "| 016XX W LAKE ST|\n", + "| 018XX W DIVISION ST|\n", + "+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "df.select(col('Block')).show()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 459 + }, + "colab_type": "code", + "id": "05YQ7WXhiFcm", + "outputId": "cba63f1a-2071-4b77-f3f4-f6d5ed249f5e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| Block|\n", + "+--------------------+\n", + "| 026XX S HALSTED ST|\n", + "|070XX S STONY ISL...|\n", + "| 069XX S ADA ST|\n", + "| 016XX S THROOP ST|\n", + "| 002XX W GOETHE ST|\n", + "|083XX S MUSKEGON AVE|\n", + "| 050XX W NELSON ST|\n", + "| 011XX W 83RD ST|\n", + "| 081XX W ADDISON ST|\n", + "| 031XX S PRAIRIE AVE|\n", + "|023XX N MILWAUKEE...|\n", + 
"| 0000X E CHESTNUT ST|\n", + "| 0000X N LATROBE AVE|\n", + "| 104XX S AVENUE J|\n", + "|025XX S CALIFORNI...|\n", + "| 021XX E 70TH ST|\n", + "|082XX S JEFFERY BLVD|\n", + "| 076XX S CICERO AVE|\n", + "| 016XX W LAKE ST|\n", + "| 018XX W DIVISION ST|\n", + "+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "df.select(df.Block).show()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 459 + }, + "colab_type": "code", + "id": "NNuJ3RIqe8yY", + "outputId": "5a83e809-ae09-409c-8c1e-9861e4a2399b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+--------------------+\n", + "| Block| Description|\n", + "+--------------------+--------------------+\n", + "| 026XX S HALSTED ST|FINANCIAL IDENTIT...|\n", + "|070XX S STONY ISL...| CHILD PORNOGRAPHY|\n", + "| 069XX S ADA ST| FORCIBLE ENTRY|\n", + "| 016XX S THROOP ST| FORCIBLE ENTRY|\n", + "| 002XX W GOETHE ST| OVER $500|\n", + "|083XX S MUSKEGON AVE| $500 AND UNDER|\n", + "| 050XX W NELSON ST|HARASSMENT BY TEL...|\n", + "| 011XX W 83RD ST|SEXUAL EXPLOITATI...|\n", + "| 081XX W ADDISON ST|FINANCIAL IDENTIT...|\n", + "| 031XX S PRAIRIE AVE|ILLEGAL USE CASH ...|\n", + "|023XX N MILWAUKEE...| OVER $500|\n", + "| 0000X E CHESTNUT ST| FROM BUILDING|\n", + "| 0000X N LATROBE AVE| TO PROPERTY|\n", + "| 104XX S AVENUE J|HARASSMENT BY TEL...|\n", + "|025XX S CALIFORNI...|AGG CRIM SEX ABUS...|\n", + "| 021XX E 70TH ST|UNLAWFUL POSS OF ...|\n", + "|082XX S JEFFERY BLVD|PRO EMP HANDS NO/...|\n", + "| 076XX S CICERO AVE| $500 AND UNDER|\n", + "| 016XX W LAKE ST|DOMESTIC BATTERY ...|\n", + "| 018XX W DIVISION ST| UNLAWFUL ENTRY|\n", + "+--------------------+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "df.select('Block','Description').show()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, 
+ "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + }, + "colab_type": "code", + "id": "oFHUmRKZeCEV", + "outputId": "726d49bc-f384-4a31-f803-cc909151a4b2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------+-----------+-------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+---+\n", + "| ID|Case Number| Date| Block|IUCR| Primary Type| Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year| Updated On|Latitude|Longitude|Location|One|\n", + "+--------+-----------+-------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+---+\n", + "|11955940| JD121140|2019-07-31 09:00:00| 026XX S HALSTED ST|1154| DECEPTIVE PRACTICE|FINANCIAL IDENTIT...| SMALL RETAIL STORE| false| false|0913| 009| 11| 60| 11| null| null|2019|01/20/2020 03:52:...| null| null| null| 1|\n", + "|11956035| JD121288|2019-09-10 00:01:00|070XX S STONY ISL...|1582|OFFENSE INVOLVING...| CHILD PORNOGRAPHY| RESIDENCE| false| false|0332| 003| 5| 43| 17| null| null|2019|01/20/2020 03:52:...| null| null| null| 1|\n", + "|11956045| JD121237|2019-12-15 12:00:00| 069XX S ADA ST|0610| BURGLARY| FORCIBLE ENTRY| RESIDENCE| false| false|0734| 007| 6| 67| 05| null| null|2019|01/20/2020 03:52:...| null| null| null| 1|\n", + "|11956146| JD121344|2019-12-31 00:00:00| 016XX S THROOP ST|0610| BURGLARY| FORCIBLE ENTRY| RESIDENCE| false| false|1233| 012| 25| 31| 05| null| null|2019|01/20/2020 03:52:...| null| null| null| 1|\n", + "|11956304| JD121233|2019-11-27 10:35:00| 002XX W GOETHE ST|0810| THEFT| OVER 
$500| RESIDENCE| false| false|1821| 018| 2| 8| 06| null| null|2019|01/20/2020 03:52:...| null| null| null| 1|\n", + "+--------+-----------+-------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+---+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "#Adding a column in PySpark\n", + "# We are adding a column called 'One' at the end\n", + "from pyspark.sql.functions import lit\n", + "df = df.withColumn('One',lit(1))\n", + "df.show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 479 + }, + "colab_type": "code", + "id": "QJqgy6lKfk2o", + "outputId": "f690fb78-50b4-4bcc-b452-ce7308d31eef" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------+-----------+-------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+----+\n", + "| ID|Case Number| Date| Block|IUCR| Primary Type| Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year| Updated On| Latitude| Longitude| Location|Test|\n", + "+--------+-----------+-------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+----+\n", + "|11955940| JD121140|2019-07-31 09:00:00| 026XX S HALSTED ST|1154| DECEPTIVE PRACTICE|FINANCIAL IDENTIT...| SMALL RETAIL STORE| false| false|0913| 009| 11| 60| 11| null| 
null|2019|01/20/2020 03:52:...| null| null| null| 1|\n", + "|11956035| JD121288|2019-09-10 00:01:00|070XX S STONY ISL...|1582|OFFENSE INVOLVING...| CHILD PORNOGRAPHY| RESIDENCE| false| false|0332| 003| 5| 43| 17| null| null|2019|01/20/2020 03:52:...| null| null| null| 1|\n", + "|11956045| JD121237|2019-12-15 12:00:00| 069XX S ADA ST|0610| BURGLARY| FORCIBLE ENTRY| RESIDENCE| false| false|0734| 007| 6| 67| 05| null| null|2019|01/20/2020 03:52:...| null| null| null| 1|\n", + "|11956146| JD121344|2019-12-31 00:00:00| 016XX S THROOP ST|0610| BURGLARY| FORCIBLE ENTRY| RESIDENCE| false| false|1233| 012| 25| 31| 05| null| null|2019|01/20/2020 03:52:...| null| null| null| 1|\n", + "|11956304| JD121233|2019-11-27 10:35:00| 002XX W GOETHE ST|0810| THEFT| OVER $500| RESIDENCE| false| false|1821| 018| 2| 8| 06| null| null|2019|01/20/2020 03:52:...| null| null| null| 1|\n", + "|11956126| JD121418|2019-12-23 06:00:00|083XX S MUSKEGON AVE|0820| THEFT| $500 AND UNDER| STREET| false| false|0423| 004| 7| 46| 06| null| null|2019|01/20/2020 03:52:...| null| null| null| 1|\n", + "|11956221| JD121487|2019-09-13 17:00:00| 050XX W NELSON ST|2825| OTHER OFFENSE|HARASSMENT BY TEL...| RESIDENCE| false| false|2521| 025| 31| 19| 26| null| null|2019|01/20/2020 03:52:...| null| null| null| 1|\n", + "|11956049| JD121196|2019-12-19 19:07:00| 011XX W 83RD ST|1544| SEX OFFENSE|SEXUAL EXPLOITATI...| RESIDENCE| false| false|0613| 006| 21| 71| 17| null| null|2019|01/20/2020 03:52:...| null| null| null| 1|\n", + "|11956468| JD121650|2019-06-01 08:00:00| 081XX W ADDISON ST|1153| DECEPTIVE PRACTICE|FINANCIAL IDENTIT...| APARTMENT| false| false|1631| 016| 38| 17| 11| null| null|2019|01/20/2020 03:52:...| null| null| null| 1|\n", + "|11955888| JD121051|2019-12-01 00:01:00| 031XX S PRAIRIE AVE|1152| DECEPTIVE PRACTICE|ILLEGAL USE CASH ...| RESIDENCE| false| false|0211| 002| 4| 35| 11| null| null|2019|01/20/2020 03:52:...| null| null| null| 1|\n", + "|11956127| JD121334|2019-12-14 22:00:00|023XX N 
MILWAUKEE...|0810| THEFT| OVER $500| OTHER| false| false|1414| 014| 1| 22| 06| null| null|2019|01/20/2020 03:52:...| null| null| null| 1|\n", + "|11956215| JD121376|2019-12-12 19:00:00| 0000X E CHESTNUT ST|0890| THEFT| FROM BUILDING| RESTAURANT| false| false|1833| 018| 42| 8| 06| null| null|2019|01/20/2020 03:52:...| null| null| null| 1|\n", + "|11956004| JD121156|2019-11-15 12:00:00| 0000X N LATROBE AVE|1310| CRIMINAL DAMAGE| TO PROPERTY| RESIDENCE| false| false|1522| 015| 28| 25| 14| null| null|2019|01/20/2020 03:52:...| null| null| null| 1|\n", + "|11956129| JD121379|2019-11-03 15:00:00| 104XX S AVENUE J|2825| OTHER OFFENSE|HARASSMENT BY TEL...| OTHER| false| false|0432| 004| 10| 52| 26| null| null|2019|01/20/2020 03:52:...| null| null| null| 1|\n", + "|11956041| JD121266|2019-12-21 14:18:00|025XX S CALIFORNI...|1752|OFFENSE INVOLVING...|AGG CRIM SEX ABUS...| APARTMENT| false| false|1033| 010| 12| 30| 17| null| null|2019|01/20/2020 03:52:...| null| null| null| 1|\n", + "|11935337| JC563784|2019-12-29 02:25:00| 021XX E 70TH ST|143A| WEAPONS VIOLATION|UNLAWFUL POSS OF ...| ALLEY| true| false|0331| 003| 5| 43| 15| 1191863| 1858976|2019|01/20/2020 03:49:...|41.768020578|-87.572284866|(41.768020578, -8...| 1|\n", + "|11927831| JC553936|2019-12-20 13:15:00|082XX S JEFFERY BLVD|0545| ASSAULT|PRO EMP HANDS NO/...| STREET| true| false|0414| 004| 8| 46| 08A| 1190954| 1850799|2019|01/20/2020 03:49:...|41.745604221|-87.575880606|(41.745604221, -8...| 1|\n", + "|11927034| JC549451|2019-12-16 19:15:00| 076XX S CICERO AVE|0820| THEFT| $500 AND UNDER| SMALL RETAIL STORE| false| false|0833| 008| 18| 65| 06| 1145727| 1853720|2019|01/20/2020 03:49:...|41.754592961|-87.741528537|(41.754592961, -8...| 1|\n", + "|11926969| JC553305|2019-12-19 21:30:00| 016XX W LAKE ST|0486| BATTERY|DOMESTIC BATTERY ...| RESIDENCE| true| false|1224| 012| 27| 28| 08B| 1165379| 1901492|2019|01/20/2020 03:49:...|41.885291047| -87.66815409|(41.885291047, -8...| 1|\n", + "|11926797| JC553193|2019-12-19 
19:04:00| 018XX W DIVISION ST|0620| BURGLARY| UNLAWFUL ENTRY| RESIDENCE| true| false|1212| 012| 1| 24| 05| 1164005| 1908028|2019|01/20/2020 03:49:...|41.903255445|-87.673014935|(41.903255445, -8...| 1|\n", + "+--------+-----------+-------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+----+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "#Renaming a column in PySpark\n", + "df = df.withColumnRenamed('One', 'Test')\n", + "df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 119 + }, + "colab_type": "code", + "id": "M1ek2opVfqea", + "outputId": "49be0adb-10a4-432e-e149-7fbc6cc66f9f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----+------+\n", + "|Year| count|\n", + "+----+------+\n", + "|2019|257625|\n", + "+----+------+\n", + "\n" + ] + } + ], + "source": [ + "#Group By a column in PySpark\n", + "df.groupBy('Year').count().show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + }, + "colab_type": "code", + "id": "xsb9PXxpfnmh", + "outputId": "60689561-7efc-45a0-b8ae-819e7f771c8b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------+-----------+-------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+\n", + "| ID|Case Number| Date| Block|IUCR| Primary Type| Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y 
Coordinate|Year| Updated On|Latitude|Longitude|Location|\n", + "+--------+-----------+-------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+\n", + "|11955940| JD121140|2019-07-31 09:00:00| 026XX S HALSTED ST|1154| DECEPTIVE PRACTICE|FINANCIAL IDENTIT...| SMALL RETAIL STORE| false| false|0913| 009| 11| 60| 11| null| null|2019|01/20/2020 03:52:...| null| null| null|\n", + "|11956035| JD121288|2019-09-10 00:01:00|070XX S STONY ISL...|1582|OFFENSE INVOLVING...| CHILD PORNOGRAPHY| RESIDENCE| false| false|0332| 003| 5| 43| 17| null| null|2019|01/20/2020 03:52:...| null| null| null|\n", + "|11956045| JD121237|2019-12-15 12:00:00| 069XX S ADA ST|0610| BURGLARY| FORCIBLE ENTRY| RESIDENCE| false| false|0734| 007| 6| 67| 05| null| null|2019|01/20/2020 03:52:...| null| null| null|\n", + "|11956146| JD121344|2019-12-31 00:00:00| 016XX S THROOP ST|0610| BURGLARY| FORCIBLE ENTRY| RESIDENCE| false| false|1233| 012| 25| 31| 05| null| null|2019|01/20/2020 03:52:...| null| null| null|\n", + "|11956304| JD121233|2019-11-27 10:35:00| 002XX W GOETHE ST|0810| THEFT| OVER $500| RESIDENCE| false| false|1821| 018| 2| 8| 06| null| null|2019|01/20/2020 03:52:...| null| null| null|\n", + "+--------+-----------+-------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "#Remove columns in PySpark\n", + "df = df.drop('Test')\n", + "df.show(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "WbKK5iHwmIoV" + }, + "source": [ + "## Working with Rows" + ] + }, + { + "cell_type": "code", + "execution_count": 0, +
"metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 479 + }, + "colab_type": "code", + "id": "YNfcjOIknA3n", + "outputId": "585cbc1a-6381-4102-d39b-f6399a87700d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------+-----------+-------------------+--------------------+----+-------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+\n", + "| ID|Case Number| Date| Block|IUCR| Primary Type| Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year| Updated On| Latitude| Longitude| Location|\n", + "+--------+-----------+-------------------+--------------------+----+-------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+\n", + "|11652463| JC221068|2019-04-11 22:30:00| 012XX W GUNNISON ST|0325| ROBBERY| VEHICULAR HIJACKING| STREET| true| false|2033| 020| 46| 3| 03| 1167294| 1932462|2019|01/20/2020 03:49:...|41.970233435|-87.660229117|(41.970233435, -8...|\n", + "|11580399| JC126732|2019-01-23 13:05:00| 033XX W FILLMORE ST|2093| NARCOTICS|FOUND SUSPECT NAR...|POLICE FACILITY/V...| true| false|1134| 011| 24| 29| 18| 1154228| 1895173|2019|01/19/2020 03:47:...|41.868180939|-87.709271389|(41.868180939, -8...|\n", + "|11953660| JD118414|2019-03-01 14:10:00| 013XX E 62ND ST|1153| DECEPTIVE PRACTICE|FINANCIAL IDENTIT...| null| false| false|0314| 003| 20| 42| 11| 1185906| 1864152|2019|01/19/2020 03:47:...|41.782366539| -87.59395659|(41.782366539, -8...|\n", + "|11955048| JD120087|2019-01-29 12:00:00| 046XX N PAULINA ST|0890| THEFT| FROM BUILDING| OTHER| false| false|1912| 019| 47| 3| 06| null| null|2019|01/19/2020 03:49:...| 
null| null| null|\n", + "|11953654| JD118405|2019-03-01 12:50:00| 008XX E 38TH PL|1153| DECEPTIVE PRACTICE|FINANCIAL IDENTIT...| null| false| false|0212| 002| 4| 36| 11| 1182247| 1879773|2019|01/19/2020 03:47:...| 41.82531739|-87.606887309|(41.82531739, -87...|\n", + "|11955104| JD120139|2019-05-15 16:00:00| 035XX W 71ST PL|1563| SEX OFFENSE|CRIMINAL SEXUAL A...| RESIDENCE| false| true|0831| 008| 17| 66| 17| null| null|2019|01/19/2020 03:49:...| null| null| null|\n", + "|11581568| JC126573|2019-01-23 11:00:00| 033XX W FILLMORE ST|2093| NARCOTICS|FOUND SUSPECT NAR...|POLICE FACILITY/V...| true| false|1134| 011| 24| 29| 18| 1154228| 1895173|2019|01/19/2020 03:47:...|41.868180939|-87.709271389|(41.868180939, -8...|\n", + "|11953335| JD117914|2019-01-10 21:30:00| 004XX N MC CLURG CT|1330| CRIMINAL TRESPASS| TO LAND| RESIDENCE-GARAGE| false| false|1834| 018| 42| 8| 26| 1179120| 1903342|2019|01/19/2020 03:47:...|41.890064174|-87.617638563|(41.890064174, -8...|\n", + "|11946410| JC151544|2019-02-13 20:49:00| 043XX S TROY ST|2024| NARCOTICS| POSS: HEROIN(WHITE)| RESIDENCE| true| false|0922| 009| 15| 58| 18| 1156071| 1875567|2019|01/18/2020 03:44:...|41.814342797|-87.703033557|(41.814342797, -8...|\n", + "|11677953| JC235447|2019-04-23 14:46:00|040XX W VAN BUREN ST|2014| NARCOTICS|MANU/DELIVER: HER...| VACANT LOT/LAND| true| false|1132| 011| 28| 26| 18| 1149531| 1897718|2019|01/18/2020 03:44:...|41.875257119| -87.72644908|(41.875257119, -8...|\n", + "|11672673| JC231408|2019-04-20 09:42:00| 085XX S ABERDEEN ST|1812| NARCOTICS|POSS: CANNABIS MO...| RESIDENCE| true| false|0613| 006| 21| 71| 18| 1170476| 1848176|2019|01/18/2020 03:44:...| 41.73887651|-87.650991641|(41.73887651, -87...|\n", + "|11672370| JC232204|2019-04-20 22:45:00| 011XX W 90TH ST|1812| NARCOTICS|POSS: CANNABIS MO...| APARTMENT| true| false|2222| 022| 21| 73| 18| 1170443| 1845120|2019|01/18/2020 03:44:...|41.730491134|-87.651201287|(41.730491134, -8...|\n", + "|11671344| JC235165|2019-04-23 13:17:00| 053XX W 
CHICAGO AVE|2017| NARCOTICS| MANU/DELIVER:CRACK| ALLEY| true| false|1524| 015| 37| 25| 18| 1140417| 1904813|2019|01/18/2020 03:44:...|41.894898651| -87.75973838|(41.894898651, -8...|\n", + "|11666457| JC228439|2019-04-17 18:34:14|054XX S CHRISTIAN...|1811| NARCOTICS|POSS: CANNABIS 30...| RESIDENCE| true| false|0822| 008| 14| 63| 18| 1154972| 1868333|2019|01/18/2020 03:44:...|41.794513753| -87.70725813|(41.794513753, -8...|\n", + "|11951328| JD115421|2019-05-01 09:00:00| 001XX E ONTARIO ST|1153| DECEPTIVE PRACTICE|FINANCIAL IDENTIT...|COMMERCIAL / BUSI...| false| false|1834| 018| 42| 8| 11| 1177462| 1904523|2019|01/17/2020 03:45:...|41.893342683| -87.62369154|(41.893342683, -8...|\n", + "|11953844| JD118256|2019-02-16 12:00:00| 062XX S KOLIN AVE|0281|CRIM SEXUAL ASSAULT| NON-AGGRAVATED| APARTMENT| false| true|0813| 008| 23| 65| 02| null| null|2019|01/17/2020 03:48:...| null| null| null|\n", + "|11950821| JC550653|2019-04-04 08:00:00| 079XX S KARLOV AVE|1154| DECEPTIVE PRACTICE|FINANCIAL IDENTIT...| RESIDENCE| false| false|0834| 008| 18| 70| 11| 1150460| 1851422|2019|01/17/2020 03:45:...|41.748196202|-87.724242888|(41.748196202, -8...|\n", + "|11951415| JD115614|2019-01-13 11:00:00|115XX S WENTWORTH...|1153| DECEPTIVE PRACTICE|FINANCIAL IDENTIT...| APARTMENT| false| false|0522| 005| 34| 53| 11| 1177005| 1828396|2019|01/17/2020 03:45:...|41.684452919|-87.627664687|(41.684452919, -8...|\n", + "|11951493| JD115459|2019-05-14 22:00:00|033XX W ARTHINGTO...|1563| SEX OFFENSE|CRIMINAL SEXUAL A...| RESIDENCE| false| true|1134| 011| 24| 29| 17| 1154345| 1895847|2019|01/17/2020 03:45:...|41.870028132|-87.708823855|(41.870028132, -8...|\n", + "|11951147| JD115168|2019-04-16 00:01:00| 005XX W MONROE ST|1120| DECEPTIVE PRACTICE| FORGERY| BANK| false| false|0121| 001| 42| 28| 10| 1172908| 1899834|2019|01/17/2020 03:45:...|41.880577917|-87.640555606|(41.880577917, -8...|\n", + 
"+--------+-----------+-------------------+--------------------+----+-------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "# Filtering rows in PySpark\n", + "df.filter(col('Date')<'2019-06-01').show()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 119 + }, + "colab_type": "code", + "id": "B1RKg1UrmBQz", + "outputId": "9653f918-dee0-47d6-a6f8-7219c51adab6" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----+\n", + "|Year|\n", + "+----+\n", + "|2019|\n", + "+----+\n", + "\n" + ] + } + ], + "source": [ + "#Get Unique Rows in PySpark\n", + "df.select('Year').distinct().show()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 479 + }, + "colab_type": "code", + "id": "4ZpeJvz0nkBI", + "outputId": "51c20b7e-67fb-49f2-c81f-955579def74d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------+-----------+-------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+\n", + "| ID|Case Number| Date| Block|IUCR| Primary Type| Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year| Updated On| Latitude| Longitude| Location|\n", + 
"+--------+-----------+-------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+\n", + "|11895528| JC515003|2019-01-01 00:00:00| 059XX W LELAND AVE|1153| DECEPTIVE PRACTICE|FINANCIAL IDENTIT...| null| false| false|1622| 016| 45| 15| 11| null| null|2019|11/19/2019 03:57:...| null| null| null|\n", + "|11940493| JD102895|2019-01-01 00:00:00|012XX S KOMENSKY AVE|141A| WEAPONS VIOLATION|UNLAWFUL USE HANDGUN|RESIDENCE PORCH/H...| false| false|1011| 010| 24| 29| 15| 1149582| 1893991|2019|01/08/2020 03:47:...|41.865028808|-87.726358587|(41.865028808, -8...|\n", + "|11937662| JC566620|2019-01-01 00:00:00| 019XX N HAMLIN AVE|1582|OFFENSE INVOLVING...| CHILD PORNOGRAPHY|SCHOOL, PRIVATE, ...| false| false|2535| 025| 26| 22| 17| 1150737| 1912590|2019|01/03/2020 03:56:...|41.916043916|-87.721631784|(41.916043916, -8...|\n", + "|11739188| JC326320|2019-01-01 00:00:00|010XX N RIDGEWAY AVE|1752|OFFENSE INVOLVING...|AGG CRIM SEX ABUS...| APARTMENT| false| true|1112| 011| 27| 23| 17| 1151192| 1906670|2019|12/18/2019 03:47:...|41.899789956|-87.720115618|(41.899789956, -8...|\n", + "|11911175| JC534087|2019-01-01 00:00:00| 002XX E 121ST ST|1153| DECEPTIVE PRACTICE|FINANCIAL IDENTIT...| RESIDENCE| false| false|0532| 005| 9| 53| 11| 1180018| 1824734|2019|12/07/2019 03:47:...|41.674335599|-87.616746527|(41.674335599, -8...|\n", + "|11906205| JC528532|2019-01-01 00:00:00|069XX S WOODLAWN AVE|1754|OFFENSE INVOLVING...|AGG SEX ASSLT OF ...| RESIDENCE| false| true|0321| 003| 5| 69| 02| 1185599| 1859302|2019|12/04/2019 03:49:...|41.769064952|-87.595234737|(41.769064952, -8...|\n", + "|11954981| JD119994|2019-01-01 00:00:00| 073XX W FARWELL AVE|1750|OFFENSE INVOLVING...| CHILD ABUSE| RESIDENCE| false| true|1611| 016| 41| 9| 08B| null| null|2019|01/19/2020 03:49:...| null| null| null|\n", 
+ "|11889579| JC507635|2019-01-01 00:00:00|082XX S WENTWORTH...|1562| SEX OFFENSE|AGG CRIMINAL SEXU...| RESIDENCE| false| false|0622| 006| 21| 44| 17| 1176419| 1850436|2019|11/15/2019 03:57:...|41.744946864|-87.629149986|(41.744946864, -8...|\n", + "|11878985| JC494834|2019-01-01 00:00:00| 067XX W GRAND AVE|0810| THEFT| OVER $500| OTHER| false| false|2512| 025| 36| 18| 06| 1131156| 1915217|2019|11/06/2019 03:54:...|41.923613437|-87.793511665|(41.923613437, -8...|\n", + "|11560259| JC109129|2019-01-01 00:00:00| 049XX N ALBANY AVE|1750|OFFENSE INVOLVING...| CHILD ABUSE| APARTMENT| false| false|1713| 017| 33| 14| 08B| 1154815| 1932723|2019|10/30/2019 03:53:...| 41.9712095|-87.706108126|(41.9712095, -87....|\n", + "|11861735| JC473834|2019-01-01 00:00:00| 108XX S SANGAMON ST|1752|OFFENSE INVOLVING...|AGG CRIM SEX ABUS...| RESIDENCE| false| true|2234| 022| 34| 75| 17| 1171912| 1832941|2019|10/18/2019 04:02:...|41.697038087|-87.646175932|(41.697038087, -8...|\n", + "|11752917| JC342582|2019-01-01 00:00:00| 028XX E 76TH ST|1752|OFFENSE INVOLVING...|AGG CRIM SEX ABUS...| APARTMENT| false| true|0421| 004| 7| 43| 17| 1196262| 1855497|2019|10/15/2019 04:01:...| 41.75836602|-87.556276032|(41.75836602, -87...|\n", + "|11838300| JC441914|2019-01-01 00:00:00|031XX W ARTHINGTO...|1752|OFFENSE INVOLVING...|AGG CRIM SEX ABUS...| RESIDENCE| false| true|1134| 011| 24| 27| 17| 1155548| 1895868|2019|09/26/2019 04:16:...|41.870061657|-87.704406691|(41.870061657, -8...|\n", + "|11752915| JC342546|2019-01-01 00:00:00|063XX S STONY ISL...|1752|OFFENSE INVOLVING...|AGG CRIM SEX ABUS...| APARTMENT| false| true|0314| 003| 20| 42| 17| 1187962| 1863227|2019|09/25/2019 03:50:...|41.779779505|-87.586448286|(41.779779505, -8...|\n", + "|11739161| JC326172|2019-01-01 00:00:00| 131XX S LANGLEY AVE|1752|OFFENSE INVOLVING...|AGG CRIM SEX ABUS...| APARTMENT| false| true|0533| 005| 9| 54| 17| 1183247| 1818205|2019|09/20/2019 03:53:...| 41.65634477|-87.605129962|(41.65634477, -87...|\n", + "|11739167| 
JC326145|2019-01-01 00:00:00| 024XX W CARMEN AVE|1752|OFFENSE INVOLVING...|AGG CRIM SEX ABUS...| APARTMENT| false| true|2031| 020| 40| 4| 17| 1159097| 1933800|2019|09/13/2019 04:06:...|41.974077735|-87.690332986|(41.974077735, -8...|\n", + "|11808448| JC409430|2019-01-01 00:00:00|016XX E HYDE PARK...|1153| DECEPTIVE PRACTICE|FINANCIAL IDENTIT...| OTHER| false| false|0222| 002| 5| 39| 11| 1188001| 1871507|2019|08/30/2019 03:57:...|41.802499513|-87.586041451|(41.802499513, -8...|\n", + "|11803895| JC404008|2019-01-01 00:00:00| 070XX S CALUMET AVE|1153| DECEPTIVE PRACTICE|FINANCIAL IDENTIT...| RESIDENCE| false| false|0322| 003| 6| 69| 11| 1179694| 1858501|2019|08/28/2019 04:08:...|41.767003943|-87.616903894|(41.767003943, -8...|\n", + "|11785377| JC381800|2019-01-01 00:00:00| 011XX N KOSTNER AVE|1310| CRIMINAL DAMAGE| TO PROPERTY| APARTMENT| false| false|1111| 011| 37| 23| 14| 1146837| 1907072|2019|08/09/2019 04:09:...|41.900977393|-87.736101441|(41.900977393, -8...|\n", + "|11777719| JC372752|2019-01-01 00:00:00| 036XX S HALSTED ST|1120| DECEPTIVE PRACTICE| FORGERY| RESIDENCE| false| false|0915| 009| 11| 60| 10| 1171564| 1880789|2019|08/03/2019 04:02:...|41.828346576|-87.646050351|(41.828346576, -8...|\n", + "+--------+-----------+-------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "# Sort Rows in PySpark\n", + "df.orderBy('Date').show()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "id": "bCZIzfYmnx--", + "outputId": "06e38cb1-cccc-44a5-f005-0a485b98a7fe" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "10" + ] + }, + "execution_count": 33, +
"metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "# Append rows in PySpark.\n", + "one_day = spark.read.csv('reported-crimes.csv',header=True).withColumn('Date',to_timestamp(col('Date'),'MM/dd/yyyy hh:mm:ss a')).filter(col('Date')==lit('2019-07-30'))\n", + "df.filter(col('Date')==lit('2019-07-30')).count()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "id": "k9CZlZ4Got_e", + "outputId": "db825a82-8e9a-4ea1-eff7-2f370676d6b5" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "20" + ] + }, + "execution_count": 34, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "df.union(one_day).filter(col('Date')==lit('2019-07-30')).count()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "1pfPzVOFqC_8" + }, + "source": [ + "**Result:**\n", + "\n", + "> As you can see here, there were 10 crimes committed on 2019-07-30, and after the union, there are 20 records.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 289 + }, + "colab_type": "code", + "id": "6tVJeAY_plob", + "outputId": "bd7b6a38-55f1-4e21-897b-04fb1a2f4661" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------------------+-----+\n", + "| Primary Type|count|\n", + "+-------------------+-----+\n", + "| THEFT|62137|\n", + "| BATTERY|49425|\n", + "| CRIMINAL DAMAGE|26638|\n", + "| ASSAULT|20584|\n", + "| DECEPTIVE PRACTICE|17674|\n", + "| OTHER OFFENSE|16543|\n", + "| NARCOTICS|13946|\n", + "| BURGLARY| 9590|\n", + "|MOTOR VEHICLE THEFT| 8968|\n", + "| ROBBERY| 7985|\n", + "+-------------------+-----+\n", + "only showing top 10 rows\n", + "\n" + ] + } + ], + "source": [ + "# Top 10 number of reported crimes by Primary Type, in
descending order of occurrence\n", + "df.groupBy(\"Primary Type\").count().orderBy('count',ascending=False).show(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "xOQPOt19q_he" + }, + "source": [ + "# Hands-on Question 🤚 !" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "uk7x_PCarIff" + }, + "source": [ + "**What percentage of reported crimes resulted in an arrest?**" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "zZkkAz-srcMf" + }, + "outputs": [], + "source": [ + "# Answer" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "9Qzp-Vb-ra6w" + }, + "source": [ + "**What are the top 3 locations for reported crimes?**" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "qI6srWS4rYwv" + }, + "outputs": [], + "source": [ + "# Answer" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "aHjILb1DriuX" + }, + "source": [ + "# Functions" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 54 + }, + "colab_type": "code", + "id": "x3vlC7ZerlKb", + "outputId": "05314019-519c-4fb9-c27a-dec7e1a5251a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Column', 'DataFrame', 'DataType', 'PandasUDFType', 'PythonEvalType', 'SparkContext', 'StringType', 'UserDefinedFunction', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', '_binary_mathfunctions', '_collect_list_doc', '_collect_set_doc', '_create_binary_mathfunction', '_create_column_from_literal', '_create_function', '_create_udf', '_create_window_function', '_functions', '_functions_1_4', '_functions_1_6', '_functions_2_1',
'_functions_2_4', '_functions_deprecated', '_lit_doc', '_message', '_string_functions', '_test', '_to_java_column', '_to_seq', '_window_functions', '_wrap_deprecated_function', 'abs', 'acos', 'add_months', 'approxCountDistinct', 'approx_count_distinct', 'array', 'array_contains', 'array_distinct', 'array_except', 'array_intersect', 'array_join', 'array_max', 'array_min', 'array_position', 'array_remove', 'array_repeat', 'array_sort', 'array_union', 'arrays_overlap', 'arrays_zip', 'asc', 'asc_nulls_first', 'asc_nulls_last', 'ascii', 'asin', 'atan', 'atan2', 'avg', 'base64', 'basestring', 'bin', 'bitwiseNOT', 'blacklist', 'broadcast', 'bround', 'cbrt', 'ceil', 'coalesce', 'col', 'collect_list', 'collect_set', 'column', 'concat', 'concat_ws', 'conv', 'corr', 'cos', 'cosh', 'count', 'countDistinct', 'covar_pop', 'covar_samp', 'crc32', 'create_map', 'cume_dist', 'current_date', 'current_timestamp', 'date_add', 'date_format', 'date_sub', 'date_trunc', 'datediff', 'dayofmonth', 'dayofweek', 'dayofyear', 'decode', 'degrees', 'dense_rank', 'desc', 'desc_nulls_first', 'desc_nulls_last', 'element_at', 'encode', 'exp', 'explode', 'explode_outer', 'expm1', 'expr', 'factorial', 'first', 'flatten', 'floor', 'format_number', 'format_string', 'from_json', 'from_unixtime', 'from_utc_timestamp', 'functools', 'get_json_object', 'greatest', 'grouping', 'grouping_id', 'hash', 'hex', 'hour', 'hypot', 'ignore_unicode_prefix', 'initcap', 'input_file_name', 'instr', 'isnan', 'isnull', 'json_tuple', 'kurtosis', 'lag', 'last', 'last_day', 'lead', 'least', 'length', 'levenshtein', 'lit', 'locate', 'log', 'log10', 'log1p', 'log2', 'lower', 'lpad', 'ltrim', 'map_concat', 'map_from_arrays', 'map_from_entries', 'map_keys', 'map_values', 'max', 'md5', 'mean', 'min', 'minute', 'monotonically_increasing_id', 'month', 'months_between', 'nanvl', 'next_day', 'ntile', 'pandas_udf', 'percent_rank', 'posexplode', 'posexplode_outer', 'pow', 'quarter', 'radians', 'rand', 'randn', 'rank', 'regexp_extract', 
'regexp_replace', 'repeat', 'reverse', 'rint', 'round', 'row_number', 'rpad', 'rtrim', 'schema_of_json', 'second', 'sequence', 'sha1', 'sha2', 'shiftLeft', 'shiftRight', 'shiftRightUnsigned', 'shuffle', 'signum', 'sin', 'since', 'sinh', 'size', 'skewness', 'slice', 'sort_array', 'soundex', 'spark_partition_id', 'split', 'sqrt', 'stddev', 'stddev_pop', 'stddev_samp', 'struct', 'substring', 'substring_index', 'sum', 'sumDistinct', 'sys', 'tan', 'tanh', 'toDegrees', 'toRadians', 'to_date', 'to_json', 'to_timestamp', 'to_utc_timestamp', 'translate', 'trim', 'trunc', 'udf', 'unbase64', 'unhex', 'unix_timestamp', 'upper', 'var_pop', 'var_samp', 'variance', 'warnings', 'weekofyear', 'when', 'window', 'year']\n" + ] + } + ], + "source": [ + "# Functions available in PySpark\n", + "from pyspark.sql import functions\n", + "print(dir(functions))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "PIKigra7A34e" + }, + "source": [ + "## String Functions" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "63QDccSjBqC4" + }, + "outputs": [], + "source": [ + "# Loading the data\n", + "from pyspark.sql.functions import col\n", + "df = spark.read.csv('reported-crimes.csv',header=True).withColumn('Date',to_timestamp(col('Date'),'MM/dd/yyyy hh:mm:ss a'))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "LiXWN8DUA9x6" + }, + "source": [ + "**Display the Primary Type column in lower and upper characters, and the first 4 characters of the column**" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 459 + }, + "colab_type": "code", + "id": "52Gh9c99BZFr", + "outputId": "88e24118-eb24-41e7-8856-bcf914264e63" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Help on function substring in module 
pyspark.sql.functions:\n", + "\n", + "substring(str, pos, len)\n", + " Substring starts at `pos` and is of length `len` when str is String type or\n", + " returns the slice of byte array that starts at `pos` in byte and is of length `len`\n", + " when str is Binary type.\n", + " \n", + " .. note:: The position is not zero based, but 1 based index.\n", + " \n", + " >>> df = spark.createDataFrame([('abcd',)], ['s',])\n", + " >>> df.select(substring(df.s, 1, 2).alias('s')).collect()\n", + " [Row(s='ab')]\n", + " \n", + " .. versionadded:: 1.5\n", + "\n", + "+--------------------+--------------------+-----------------------------+\n", + "| lower(Primary Type)| upper(Primary Type)|substring(Primary Type, 1, 4)|\n", + "+--------------------+--------------------+-----------------------------+\n", + "| deceptive practice| DECEPTIVE PRACTICE| DECE|\n", + "|offense involving...|OFFENSE INVOLVING...| OFFE|\n", + "| burglary| BURGLARY| BURG|\n", + "| burglary| BURGLARY| BURG|\n", + "| theft| THEFT| THEF|\n", + "+--------------------+--------------------+-----------------------------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql.functions import col,lower, upper, substring\n", + "help(substring)\n", + "df.select(lower(col('Primary Type')),upper(col('Primary Type')),substring(col('Primary Type'),1,4)).show(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ldtA0wk9BMkT" + }, + "source": [ + "## Numeric functions" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "kmz4G5LVBOs6" + }, + "source": [ + "**Show the oldest date and the most recent date**" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 119 + }, + "colab_type": "code", + "id": "wBDDH-YpBbdk", + "outputId": "988c8eb7-a03b-4926-f4d1-2d4e60b13e28" + }, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "+-------------------+-------------------+\n", + "| min(Date)| max(Date)|\n", + "+-------------------+-------------------+\n", + "|2019-01-01 00:00:00|2019-12-31 23:55:00|\n", + "+-------------------+-------------------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql.functions import min, max\n", + "df.select(min(col('Date')), max(col('Date'))).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "KQ6Ul9HGCwC3" + }, + "source": [ + "## Date" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "9WFrADdkBPsX" + }, + "source": [ + "**What is 3 days earlier than the oldest date and 3 days later than the most recent date?**" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 119 + }, + "colab_type": "code", + "id": "YrNmD5umAx__", + "outputId": "e0ff823f-cdc1-4f2b-933e-f716b23d1a74" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------------------+----------------------+\n", + "|date_add(max(Date), 3)|date_sub(min(Date), 3)|\n", + "+----------------------+----------------------+\n", + "| 2020-01-03| 2018-12-29|\n", + "+----------------------+----------------------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql.functions import date_add, date_sub\n", + "df.select(date_add(max(col('Date')),3), date_sub(min(col('Date')),3)).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "sY6PstyLDp6P" + }, + "source": [ + "# Working with Dates" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "s1jmBN2qFHyk" + }, + "source": [ + "> [PySpark follows SimpleDateFormat table of Java. 
Click here to view the docs.](https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 119 + }, + "colab_type": "code", + "id": "sCTeI_JvDCsH", + "outputId": "786f1816-5e7e-4745-a46d-103be8a80294" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------------------+\n", + "| Christmas|\n", + "+-------------------+\n", + "|2019-12-25 13:30:00|\n", + "+-------------------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql.functions import to_date, to_timestamp, lit\n", + "df = spark.createDataFrame([('2019-12-25 13:30:00',)], ['Christmas'])\n", + "df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 119 + }, + "colab_type": "code", + "id": "ZH8ja1eHEW8x", + "outputId": "4947fd01-5043-40c4-fcef-c2335af911ea" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------------------------------------------+------------------------------------------------+\n", + "|to_date(`Christmas`, 'yyyy-MM-dd HH:mm:ss')|to_timestamp(`Christmas`, 'yyyy-MM-dd HH:mm:ss')|\n", + "+-------------------------------------------+------------------------------------------------+\n", + "| 2019-12-25| 2019-12-25 13:30:00|\n", + "+-------------------------------------------+------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "df.select(to_date(col('Christmas'),'yyyy-MM-dd HH:mm:ss'), to_timestamp(col('Christmas'),'yyyy-MM-dd HH:mm:ss')).show()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 119 + }, + "colab_type": "code", + "id": "7g9m_8PPErI1", + "outputId": "941eed2f-7002-4311-cc71-61913e8767ba" + }, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "+--------------------------------------------+-------------------------------------------------+\n", + "|to_date(`Christmas`, 'dd/MMM/yyyy HH:mm:ss')|to_timestamp(`Christmas`, 'dd/MMM/yyyy HH:mm:ss')|\n", + "+--------------------------------------------+-------------------------------------------------+\n", + "| 2019-12-25| 2019-12-25 13:30:00|\n", + "+--------------------------------------------+-------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "df = spark.createDataFrame([('25/Dec/2019 13:30:00',)], ['Christmas'])\n", + "df.select(to_date(col('Christmas'),'dd/MMM/yyyy HH:mm:ss'), to_timestamp(col('Christmas'),'dd/MMM/yyyy HH:mm:ss')).show()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 323 + }, + "colab_type": "code", + "id": "x26Ls5SzE9qJ", + "outputId": "5f499b28-b27e-4de2-ac61-df11fabd5aeb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| Christmas|\n", + "+--------------------+\n", + "|12/25/2019 01:30:...|\n", + "+--------------------+\n", + "\n", + "+----------------------+\n", + "|Christmas |\n", + "+----------------------+\n", + "|12/25/2019 01:30:00 PM|\n", + "+----------------------+\n", + "\n", + "+----------------------------------------------+---------------------------------------------------+\n", + "|to_date(`Christmas`, 'MM/dd/yyyy hh:mm:ss aa')|to_timestamp(`Christmas`, 'MM/dd/yyyy hh:mm:ss aa')|\n", + "+----------------------------------------------+---------------------------------------------------+\n", + "| 2019-12-25| 2019-12-25 13:30:00|\n", + "+----------------------------------------------+---------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "df = spark.createDataFrame([('12/25/2019 01:30:00 PM',)], ['Christmas'])\n", + "df.show(1)\n", + 
"df.show(1, truncate = False)\n", + "df.select(to_date(col('Christmas'),'MM/dd/yyyy hh:mm:ss aa'), to_timestamp(col('Christmas'),'MM/dd/yyyy hh:mm:ss aa')).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "7OZElEvcGOD1" + }, + "source": [ + "# Working with joins" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "UJBC7r3JFyCL" + }, + "outputs": [], + "source": [ + "# Loading the data\n", + "from pyspark.sql.functions import col\n", + "df = spark.read.csv('reported-crimes.csv',header=True).withColumn('Date',to_timestamp(col('Date'),'MM/dd/yyyy hh:mm:ss a'))" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + }, + "colab_type": "code", + "id": "-_aNpIqwIERe", + "outputId": "7e26e2fe-fabe-441c-c09c-0f14e64377d9" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2020-01-22 04:30:02-- https://data.cityofchicago.org/api/views/z8bn-74gv/rows.csv?accessType=DOWNLOAD\n", + "Resolving data.cityofchicago.org (data.cityofchicago.org)... 52.206.140.205, 52.206.68.26, 52.206.140.199\n", + "Connecting to data.cityofchicago.org (data.cityofchicago.org)|52.206.140.205|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: unspecified [text/csv]\n", + "Saving to: ‘police-station.csv’\n", + "\n", + "\r", + "police-station.csv [<=> ] 0 --.-KB/s \r", + "police-station.csv [ <=> ] 5.57K --.-KB/s in 0s \n", + "\n", + "2020-01-22 04:30:03 (562 MB/s) - ‘police-station.csv’ saved [5699]\n", + "\n" + ] + } + ], + "source": [ + "# Dowloading police station data\n", + "!wget -O police-station.csv https://data.cityofchicago.org/api/views/z8bn-74gv/rows.csv?accessType=DOWNLOAD" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 459 + }, + "colab_type": "code", + "id": "HOQjNSrdIO1n", + "outputId": "d7e8ad6f-750a-4ca9-e0d2-ea624a19b3f2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-----------------+--------------------+-------+-----+-----+--------------------+------------+------------+------------+------------+------------+-----------+------------+--------------------+\n", + "| DISTRICT| DISTRICT NAME| ADDRESS| CITY|STATE| ZIP| WEBSITE| PHONE| FAX| TTY|X COORDINATE|Y COORDINATE| LATITUDE| LONGITUDE| LOCATION|\n", + "+--------------------+-----------------+--------------------+-------+-----+-----+--------------------+------------+------------+------------+------------+------------+-----------+------------+--------------------+\n", + "| 1| Central| 1718 S State St|Chicago| IL|60616|http://home.chica...|312-745-4290|312-745-3694|312-745-3693| 1176569.052| 1891771.704|41.85837259|-87.62735617|(41.8583725929, -...|\n", + "| 6| Gresham| 7808 S Halsted St|Chicago| IL|60620|http://home.chica...|312-745-3617|312-745-3649|312-745-3639| 1172283.013| 1853022.646|41.75213684|-87.64422891|(41.7521368378, -...|\n", + "| 11| Harrison| 3151 W Harrison St|Chicago| IL|60612|http://home.chica...|312-746-8386|312-746-4281|312-746-5151| 1155244.069| 1897148.755|41.87358229|-87.70548813|(41.8735822883, -...|\n", + "| 16| Jefferson Park|5151 N 
Milwaukee Ave|Chicago| IL|60630|http://home.chica...|312-742-4480|312-742-4421|312-742-4423| 1138480.758| 1933660.473|41.97409445|-87.76614884|(41.9740944511, -...|\n", + "| Headquarters| Headquarters| 3510 S Michigan Ave|Chicago| IL|60653|http://home.chica...| null| null| null| 1177731.401| 1881697.404|41.83070169|-87.62339535|(41.8307016873, -...|\n", + "| 24| Rogers Park| 6464 N Clark St|Chicago| IL|60626|http://home.chica...|312-744-5907|312-744-6928|312-744-7603| 1164193.588| 1943199.401|41.99976348|-87.67132429|(41.9997634842, -...|\n", + "| 2| Wentworth|5101 S Wentworth Ave|Chicago| IL|60609|http://home.chica...|312-747-8366|312-747-5396|312-747-6656| 1175864.837| 1871153.753|41.80181109|-87.63056018|(41.8018110912, -...|\n", + "| 7| Englewood| 1438 W 63rd St|Chicago| IL|60636|http://home.chica...|312-747-8223|312-747-6558|312-747-6652| 1167659.235| 1863005.522|41.77963154|-87.66088702|(41.7796315359, -...|\n", + "| 25| Grand Central| 5555 W Grand Ave|Chicago| IL|60639|http://home.chica...|312-746-8605|312-746-4353|312-746-8383| 1138770.871| 1913442.439|41.91860889|-87.76557448|(41.9186088912, -...|\n", + "| 10| Ogden| 3315 W Ogden Ave|Chicago| IL|60623|http://home.chica...|312-747-7511|312-747-7429|312-747-7471| 1154500.753| 1890985.501|41.85668453|-87.70838196|(41.8566845327, -...|\n", + "| 15| Austin| 5701 W Madison St|Chicago| IL|60644|http://home.chica...|312-743-1440|312-743-1366|312-743-1485| 1138148.815| 1899399.078|41.88008346|-87.76819989|(41.8800834614, -...|\n", + "| 3| Grand Crossing|7040 S Cottage Gr...|Chicago| IL|60637|http://home.chica...|312-747-8201|312-747-5479|312-747-9168| 1182739.183| 1858317.732|41.76643089|-87.60574786|(41.7664308925, -...|\n", + "| 14| Shakespeare|2150 N California...|Chicago| IL|60647|http://home.chica...|312-744-8250|312-744-2422|312-744-8260| 1157304.426| 1914481.521|41.92110332|-87.69745182|(41.9211033246, -...|\n", + "| 8| Chicago Lawn| 3420 W 63rd St|Chicago| 
IL|60629|http://home.chica...|312-747-8730|312-747-8545|312-747-8116| 1154575.242| 1862672.049|41.77898719|-87.70886382|(41.778987189, -8...|\n", + "| 4| South Chicago| 2255 E 103rd St|Chicago| IL|60617|http://home.chica...|312-747-7581|312-747-5276|312-747-9169| 1193131.299| 1837090.265|41.70793329|-87.56834912|(41.7079332906, -...|\n", + "| 20| Lincoln| 5400 N Lincoln Ave|Chicago| IL|60625|http://home.chica...|312-742-8714|312-742-8803|312-742-8841| 1158399.146| 1935788.826|41.97954951|-87.69284451|(41.9795495131, -...|\n", + "| 18| Near North| 1160 N Larrabee St|Chicago| IL|60610|http://home.chica...|312-742-5870|312-742-5771|312-742-5773| 1172080.029| 1908086.527|41.90324165|-87.64335214|(41.9032416531, -...|\n", + "| 12| Near West|1412 S Blue Islan...| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "|\",Chicago,IL,6060...| -87.6569725149)\"| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "| 9| Deering| 3120 S Halsted St|Chicago| IL|60608|http://home.chica...|312-747-8227|312-747-5329|312-747-9172| 1171440.24| 1884085.224|41.83739443|-87.64640771|(41.8373944311, -...|\n", + "+--------------------+-----------------+--------------------+-------+-----+-----+--------------------+------------+------------+------------+------------+------------+-----------+------------+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "ps = spark.read.csv(\"police-station.csv\", header=True)\n", + "ps.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "hY05yWIEJIdb" + }, + "source": [ + "**The reported crimes dataset has only the district number. 
Add the district name by joining with the police station dataset.**" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "id": "QePrc33RI25U", + "outputId": "59d62a27-b7bf-454a-bc5f-b9214cdeb7ad" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "257625" + ] + }, + "execution_count": 48, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "# Caching the crimes dataset to speed things up, and then, since spark does lazy evaluation, gonna run an action to make it evaluated.\n", + "df.cache()\n", + "df.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 901 + }, + "colab_type": "code", + "id": "lSsxX7qoJinQ", + "outputId": "cf8ced90-8228-4297-a750-ee09b00cbe7d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------+\n", + "| DISTRICT|\n", + "+------------+\n", + "| 7|\n", + "| 15|\n", + "| 11|\n", + "| 3|\n", + "| 8|\n", + "| 22|\n", + "| 16|\n", + "| 5|\n", + "| 18|\n", + "| 17|\n", + "| 6|\n", + "| 19|\n", + "| 25|\n", + "|Headquarters|\n", + "| 24|\n", + "| 9|\n", + "| 1|\n", + "| 20|\n", + "| 10|\n", + "| 4|\n", + "+------------+\n", + "only showing top 20 rows\n", + "\n", + "+--------+\n", + "|District|\n", + "+--------+\n", + "| 009|\n", + "| 012|\n", + "| 024|\n", + "| 031|\n", + "| 015|\n", + "| 006|\n", + "| 019|\n", + "| 020|\n", + "| 011|\n", + "| 025|\n", + "| 003|\n", + "| 005|\n", + "| 016|\n", + "| 018|\n", + "| 008|\n", + "| 022|\n", + "| 001|\n", + "| 014|\n", + "| 010|\n", + "| 004|\n", + "+--------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "ps.select(col('DISTRICT')).distinct().show()\n", + "df.select(col('District')).distinct().show()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": 
{ + "colab": { + "base_uri": "https://localhost:8080/", + "height": 459 + }, + "colab_type": "code", + "id": "7keEhyBMJsFT", + "outputId": "38229d80-ea97-4213-e727-356ae9e021b4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------+\n", + "|Format_district|\n", + "+---------------+\n", + "| 001|\n", + "| 006|\n", + "| 011|\n", + "| 016|\n", + "| Hea|\n", + "| 024|\n", + "| 002|\n", + "| 007|\n", + "| 025|\n", + "| 010|\n", + "| 015|\n", + "| 003|\n", + "| 014|\n", + "| 008|\n", + "| 004|\n", + "| 020|\n", + "| 018|\n", + "| 012|\n", + "| \",C|\n", + "| 009|\n", + "+---------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "# Transforming the police station data: left-pad DISTRICT with zeros to 3 characters so that it matches the District format in the crime data\n", + "from pyspark.sql.functions import lpad\n", + "ps = ps.withColumn('Format_district',lpad(col('DISTRICT'),3,'0'))\n", + "ps.select(col('Format_district')).show()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 459 + }, + "colab_type": "code", + "id": "U7Py4EYyKJTN", + "outputId": "8cb9a1eb-78d0-4dfa-c4fb-9256eb9467ad" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------+-----------+-------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+----+--------------------+--------+--------------+---------------+\n", + "| ID|Case Number| Date| Block|IUCR| Primary Type| Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|Year| Updated On|DISTRICT| DISTRICT NAME|Format_district|\n", + 
"+--------+-----------+-------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+----+--------------------+--------+--------------+---------------+\n", + "|11955940| JD121140|2019-07-31 09:00:00| 026XX S HALSTED ST|1154| DECEPTIVE PRACTICE|FINANCIAL IDENTIT...| SMALL RETAIL STORE| false| false|0913| 009| 11| 60| 11|2019|01/20/2020 03:52:...| 9| Deering| 009|\n", + "|11956035| JD121288|2019-09-10 00:01:00|070XX S STONY ISL...|1582|OFFENSE INVOLVING...| CHILD PORNOGRAPHY| RESIDENCE| false| false|0332| 003| 5| 43| 17|2019|01/20/2020 03:52:...| 3|Grand Crossing| 003|\n", + "|11956045| JD121237|2019-12-15 12:00:00| 069XX S ADA ST|0610| BURGLARY| FORCIBLE ENTRY| RESIDENCE| false| false|0734| 007| 6| 67| 05|2019|01/20/2020 03:52:...| 7| Englewood| 007|\n", + "|11956146| JD121344|2019-12-31 00:00:00| 016XX S THROOP ST|0610| BURGLARY| FORCIBLE ENTRY| RESIDENCE| false| false|1233| 012| 25| 31| 05|2019|01/20/2020 03:52:...| 12| Near West| 012|\n", + "|11956304| JD121233|2019-11-27 10:35:00| 002XX W GOETHE ST|0810| THEFT| OVER $500| RESIDENCE| false| false|1821| 018| 2| 8| 06|2019|01/20/2020 03:52:...| 18| Near North| 018|\n", + "|11956126| JD121418|2019-12-23 06:00:00|083XX S MUSKEGON AVE|0820| THEFT| $500 AND UNDER| STREET| false| false|0423| 004| 7| 46| 06|2019|01/20/2020 03:52:...| 4| South Chicago| 004|\n", + "|11956221| JD121487|2019-09-13 17:00:00| 050XX W NELSON ST|2825| OTHER OFFENSE|HARASSMENT BY TEL...| RESIDENCE| false| false|2521| 025| 31| 19| 26|2019|01/20/2020 03:52:...| 25| Grand Central| 025|\n", + "|11956049| JD121196|2019-12-19 19:07:00| 011XX W 83RD ST|1544| SEX OFFENSE|SEXUAL EXPLOITATI...| RESIDENCE| false| false|0613| 006| 21| 71| 17|2019|01/20/2020 03:52:...| 6| Gresham| 006|\n", + "|11956468| JD121650|2019-06-01 08:00:00| 081XX W ADDISON ST|1153| DECEPTIVE PRACTICE|FINANCIAL IDENTIT...| APARTMENT| false| false|1631| 016| 38| 17| 
11|2019|01/20/2020 03:52:...| 16|Jefferson Park| 016|\n", + "|11955888| JD121051|2019-12-01 00:01:00| 031XX S PRAIRIE AVE|1152| DECEPTIVE PRACTICE|ILLEGAL USE CASH ...| RESIDENCE| false| false|0211| 002| 4| 35| 11|2019|01/20/2020 03:52:...| 2| Wentworth| 002|\n", + "|11956127| JD121334|2019-12-14 22:00:00|023XX N MILWAUKEE...|0810| THEFT| OVER $500| OTHER| false| false|1414| 014| 1| 22| 06|2019|01/20/2020 03:52:...| 14| Shakespeare| 014|\n", + "|11956215| JD121376|2019-12-12 19:00:00| 0000X E CHESTNUT ST|0890| THEFT| FROM BUILDING| RESTAURANT| false| false|1833| 018| 42| 8| 06|2019|01/20/2020 03:52:...| 18| Near North| 018|\n", + "|11956004| JD121156|2019-11-15 12:00:00| 0000X N LATROBE AVE|1310| CRIMINAL DAMAGE| TO PROPERTY| RESIDENCE| false| false|1522| 015| 28| 25| 14|2019|01/20/2020 03:52:...| 15| Austin| 015|\n", + "|11956129| JD121379|2019-11-03 15:00:00| 104XX S AVENUE J|2825| OTHER OFFENSE|HARASSMENT BY TEL...| OTHER| false| false|0432| 004| 10| 52| 26|2019|01/20/2020 03:52:...| 4| South Chicago| 004|\n", + "|11956041| JD121266|2019-12-21 14:18:00|025XX S CALIFORNI...|1752|OFFENSE INVOLVING...|AGG CRIM SEX ABUS...| APARTMENT| false| false|1033| 010| 12| 30| 17|2019|01/20/2020 03:52:...| 10| Ogden| 010|\n", + "|11935337| JC563784|2019-12-29 02:25:00| 021XX E 70TH ST|143A| WEAPONS VIOLATION|UNLAWFUL POSS OF ...| ALLEY| true| false|0331| 003| 5| 43| 15|2019|01/20/2020 03:49:...| 3|Grand Crossing| 003|\n", + "|11927831| JC553936|2019-12-20 13:15:00|082XX S JEFFERY BLVD|0545| ASSAULT|PRO EMP HANDS NO/...| STREET| true| false|0414| 004| 8| 46| 08A|2019|01/20/2020 03:49:...| 4| South Chicago| 004|\n", + "|11927034| JC549451|2019-12-16 19:15:00| 076XX S CICERO AVE|0820| THEFT| $500 AND UNDER| SMALL RETAIL STORE| false| false|0833| 008| 18| 65| 06|2019|01/20/2020 03:49:...| 8| Chicago Lawn| 008|\n", + "|11926969| JC553305|2019-12-19 21:30:00| 016XX W LAKE ST|0486| BATTERY|DOMESTIC BATTERY ...| RESIDENCE| true| false|1224| 012| 27| 28| 08B|2019|01/20/2020 03:49:...| 
12| Near West| 012|\n", + "|11926797| JC553193|2019-12-19 19:04:00| 018XX W DIVISION ST|0620| BURGLARY| UNLAWFUL ENTRY| RESIDENCE| true| false|1212| 012| 1| 24| 05|2019|01/20/2020 03:49:...| 12| Near West| 012|\n", + "+--------+-----------+-------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+----+--------------------+--------+--------------+---------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "# Executing the join and deleting some column so that we don't get too much data\n", + "df.join(ps, df.District == ps.Format_district, 'left_outer').drop(\n", + " 'ADDRESS',\n", + " 'CITY',\n", + " 'STATE',\n", + " 'ZIP',\n", + " 'WEBSITE',\n", + " 'PHONE',\n", + " 'FAX',\n", + " 'TTY',\n", + " 'X COORDINATE',\n", + " 'Y COORDINATE',\n", + " 'LATITUDE',\n", + " 'LONGITUDE',\n", + " 'LOCATION').show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "EEEB2TVqL4Ie" + }, + "source": [ + "# Hands-on again!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "50gdkX-_MEMv" + }, + "source": [ + "**What is the most frequently reported non-criminal activity?**" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "pbjuihaOL-fM" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "UdMXUsjKME2y" + }, + "source": [ + "**Which day of the week has the most reported crimes?**" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "7Sdq1ZwRMN0p" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "REpHzg_nNctg" + }, + "source": [ + "**Using a bar chart, show which day of the week has the highest number of reported crimes.**" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 + }, + "colab_type": "code", + "id": "j5olv2qkNfPa", + "outputId": "c6c14f40-9415-474b-fbc2-fabfb3596f30" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Sun', 'Mon', 'Thu', 'Sat', 'Wed', 'Tue', 'Fri']\n", + "[20036, 21041, 20701, 21786, 20652, 21331, 22147]\n" + ] + } + ], + "source": [ + "from pyspark.sql.functions import date_format, col\n", + "dow = [x[0] for x in df.groupBy(date_format(col('Date'),'E')).count().collect()]\n", + "print(dow)\n", + "cnt = [x[1] for x in df.groupBy(date_format(col('Date'),'E')).count().collect()]\n", + "print(cnt)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + }, + "colab_type": "code", + "id": "gzOCDOqFOOc6", + "outputId": "860dba1a-8e8d-4629-bc80-43d60a59becb" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Day_of_weekCount
0Sun20036
1Mon21041
2Thu20701
3Sat21786
4Wed20652
\n", + "
" + ], + "text/plain": [ + " Day_of_week Count\n", + "0 Sun 20036\n", + "1 Mon 21041\n", + "2 Thu 20701\n", + "3 Sat 21786\n", + "4 Wed 20652" + ] + }, + "execution_count": 203, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "cp = pd.DataFrame({'Day_of_week':dow, 'Count':cnt})\n", + "cp.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 325 + }, + "colab_type": "code", + "id": "J4pxrZeSOvIS", + "outputId": "c1511ea1-19b9-4603-c10c-3e6c85fcbae6" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'No.of reported crimes per day')" + ] + }, + "execution_count": 208, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZUAAAEjCAYAAAD6yJxTAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3WmYHVW59vH/TQgkkEAgBAQCBBFQ\nZkIYREFEBUQmEREcCMgB8YjKERD0VYLiACjiQQWMjHFgUKaoIHI4RCYRkjAPSg4EEiAhhClECJA8\n74daTSqb7t3V3bW7utL377rq6l2rpmfvdPbTtdaqtRQRmJmZlWGZqgMwM7Olh5OKmZmVxknFzMxK\n46RiZmalcVIxM7PSOKmYmVlpnFSsT5G0hqSbJc2TdEbV8bSCpJD0rhLOs5Okf5YRU11IOlTSrVXH\nYR1zUrFOSZou6VlJK+bK/kPSpBZc7kjgOWCliDi2Befvkb70pRYRt0TExlXHYZbnpGJFDQC+2gvX\nWQ94KAo+lStpQIvjyV9r2d66Vmf6UixlWRrfU3/kpGJF/Qg4TtKw9jZK2lHSXZJeSj937OhEHe0r\n6SJgLPB1Sa9I+nA7x14k6RxJ10qaD3xQ0vKSfizpSUmzJZ0raXDafxdJMyV9U9Jz6a7rM7nzrSxp\ngqQ5kp6Q9C1Jy6Rth0q6TdKZkuYClwHnAu9N8b2Y9uvw+mn78ZKekfS0pM83+5AlrSrpwrTvC5Ku\nbngfJ0iaBVzYVpY7dnq61n2S5ks6P1UnXpeqE/9H0iq5/XeQdLukFyXdK2mX3LZDJT2Wjns8/5k1\nxHuypD9IuiztO1XSlrnta0m6In2+j0v6SjvH/kbSy8Ch7Zx/uKSJkl6WdCewQcP2/5Y0I22fImmn\nVP4OSf+WNDy37+gUx8Bm/wbWQxHhxUvTBZgOfBi4EvheKvsPYFJ6vSrwAvA5YFng4LQ+vJ1zNd0X\nuKjtGh3EchHwEvA+sj+KBgFnAhPTuYcCfwR+mPbfBXgT+AmwPPABYD6wcdo+AbgmHTcK+Bdw
eNp2\naDr2yynWwans1oaYml1/D2A2sBmwIvA7IIB3dfD+/kyWvFYBBgIfaHgfp6X3MTiVzWz4d7oDWANY\nG3gWmApsnT6n/wXGpX3XBuYCe6bP8SNpfUSK8+XcZ7QmsGkH8Z4MvAEckOI9Dng8vV4GmAKcBCwH\nvBN4DNi94dj90r6D2zn/pcDlKabNgKfynz/wWWB4+vc5FpgFDErbrgW+2PDv9LOq/z8t7UvlAXjp\n+wuLk8pmZF/oI1gyqXwOuLPhmL8Dh7Zzrqb7UiypTMitiyxJbJArey/weHrd9mW8Ym775cC3yar0\nXgc2yW37Qu59HQo82XD9Qxu+1Dq7/gXAqbltG9FBUklf3ouAVdrZtkuKdVBDWWNS+Uxu/QrgnNz6\nl4Gr0+sTgF83XON6sjvFFYEXgU+090XfcMzJwB259WWAZ4CdgO3b+fy+AVyYO/bmJuceQJZ03p0r\n+wENSb3hmBeALdPrTwG35c41C9iu6v9PS/viOkwrLCIekPQn4ETg4dymtYAnGnZ/guyv4UZd2bcj\nM3KvRwArAFMktZWJ7EukzQsRMb/hemsBq5H9Rf1Ew7Z8LPlrtaez669F9td6/vwdWQd4PiJe6GD7\nnIh4rZN4Zudev9rO+pD0ej3gk5L2zm0fCNwUEfMlfYrsruN8SbcBx0bEIx1c863PKCIWpSq5tciS\n51pt1YTJAOCW9o5txwiyO5D8Pkt8fpKOAw7PXW8lsn9XyO5Az5W0PrAx8FJE3NnkelYCt6lYV40D\njmDJL96nyb6k8tYlq6po1JV9O5JvxH+O7Mty04gYlpaVI2JIbp9VlOu5lq73dDr2jYZ4GmNp7DDQ\nuN7Z9Z8hSxb583dkBrBqR+1W7Vy7J2aQ3akMyy0rRsSpABFxfUR8hOzu6RHgV03O9db7S+1RI8k+\n3xlkd2z5awyNiD0Lvqc5ZHeZ7X5+qf3k68CBZHd3w8jupJXew2tkd6WfJbtD/nWTa1lJnFSsSyJi\nGlmd/1dyxdcCG0n6tKRl01+5mwB/aucUXdm3SDyLyL7wzpS0OoCktSXt3rDrdyQtl76I9gJ+HxEL\nyb50vi9pqKT1gK8Bv2lyydnASEnLFbz+5cChkjaRtAJZUu7ovTwDXAecLWkVSQMl7dyVz6MLfgPs\nLWl3SQMkDUoN/yNT4/6+KREvAF4hq5bryDaS9lfWe+uYdMwdwJ3AvNS5YHC6zmaSti0SYPr3uRI4\nWdIKkjYhq55rM5Qs6cwBlpV0EtmdSt4EsirLfXBS6RVOKtYd3yWrdwcgIuaSfVEfS9bY+3Vgr4h4\nDiD1hjq3yL7ddAIwDbgj9SL6H7LqjjazyOranwZ+CxyVq8r5MlmbyGPArWQN6Rc0udb/Ag8CsyS1\nxdzh9SPiOuCn6bhp6WcznyO7e3qErKH9mE7275aImAHsC3yT7Et5BnA82XfCMmTJ9WngebLODV9s\ncrpryNov2jpg7B8Rb6SksBewFVnj/XPAecDKXQj1aLIqu1lk7WkX5rZdD/yFrHPFE8BrNFSnRcRt\nZAlxakQ0q3q0kijCk3TZ0it1k/1NRIysOpalkaSTyTodfLbqWDoi6X+B30XEeVXH0h+4od7Mllqp\nqm002V2Z9QJXf5nZUknSxWRVkcdExLyq4+kvXP1lZmal8Z2KmZmVxknFzMxK0+8a6ldbbbUYNWpU\n1WGYmdXKlClTnouIEZ3t1++SyqhRo5g8eXLVYZiZ1YqkQs/5uPrLzMxK46RiZmalcVIxM7PS9Ls2\nFTOzjrzxxhvMnDmT117rbIaBpdegQYMYOXIkAwd2b4JMJxUzs2TmzJkMHTqUUaNGkZsfp9+ICObO\nncvMmTNZf/31u3UOV3+ZmSWvvfYaw4cP75cJBUASw4cP79GdmpOKmVlOf00obXr6/p1UzMz6kFmz\nZnHQQQexwQYbsM0227Dnnnvyr3/9q7TzT5o0idtvv720
8zVym0pnWv1Xiwf0NOu7yv7/38n/94jg\n4x//OGPHjuXSSy8F4N5772X27NlstNFGpYQwadIkhgwZwo477ljK+Rr5TsXMrI+46aabGDhwIEcd\nddRbZVtuuSXvf//7Of7449lss83YfPPNueyyy4AsQey1115v7Xv00Udz0UUXAdnoIePGjWP06NFs\nvvnmPPLII0yfPp1zzz2XM888k6222opbbrml9PfgOxUzsz7igQceYJtttnlb+ZVXXsk999zDvffe\ny3PPPce2227Lzjvv3On5VlttNaZOncrZZ5/Nj3/8Y8477zyOOuoohgwZwnHHHdeKt+A7FTOzvu7W\nW2/l4IMPZsCAAayxxhp84AMf4K677ur0uP333x+AbbbZhunTp7c4yoyTytJMau1iZqXadNNNmTJl\nSuH9l112WRYtWvTWemNX4OWXXx6AAQMG8Oabb5YTZCecVMzM+ohdd92VBQsWMH78+LfK7rvvPoYN\nG8Zll13GwoULmTNnDjfffDPbbbcd6623Hg899BALFizgxRdf5MYbb+z0GkOHDmXevNbNruw2FTOz\nPkISV111FccccwynnXYagwYNYtSoUfz0pz/llVdeYcstt0QSp59+Ou94xzsAOPDAA9lss81Yf/31\n2XrrrTu9xt57780BBxzANddcw89+9jN22mmnct9Df5ujfsyYMdGl+VTq3KW4zrGbVeDhhx/mPe95\nT9VhVK69z0HSlIgY09mxvlOxvstJ0ax23KZiZmalcVIxM7PSuPrLrFVcfVdLEdGvB5XsaTu771TM\nzJJBgwYxd+7cHn+x1lXbfCqDBg3q9jl8p2JmlowcOZKZM2cyZ86cqkOpTNvMj93lpGJm7at79V03\n4h8IFJ7vsJ/ezXTG1V9mZlYaJxUzMytNy5KKpHUk3STpIUkPSvpqKl9V0g2SHk0/V0nlknSWpGmS\n7pM0OneusWn/RyWNzZVvI+n+dMxZ6s9dNszM+oBW3qm8CRwbEZsAOwBfkrQJcCJwY0RsCNyY1gE+\nCmyYliOBcyBLQsA4YHtgO2BcWyJK+xyRO26PFr4fMzPrRMuSSkQ8ExFT0+t5wMPA2sC+wMVpt4uB\n/dLrfYEJkbkDGCZpTWB34IaIeD4iXgBuAPZI21aKiDsi6/83IXcuMzOrQK/0/pI0Ctga+AewRkQ8\nkzbNAtZIr9cGZuQOm5nKmpXPbKe8vesfSXb3w7rrrtv9N2Jm1ltq2vuu5Q31koYAVwDHRMTL+W3p\nDqPl/fIiYnxEjImIMSNGjGj15czM+q2WJhVJA8kSym8j4spUPDtVXZF+PpvKnwLWyR0+MpU1Kx/Z\nTrmZmVWklb2/BJwPPBwRP8ltmgi09eAaC1yTKz8k9QLbAXgpVZNdD+wmaZXUQL8bcH3a9rKkHdK1\nDsmdy8zMKtDKNpX3AZ8D7pd0Tyr7JnAqcLmkw4EngAPTtmuBPYFpwL+BwwAi4nlJpwB3pf2+GxHP\np9f/CVwEDAauS4uZmVXEMz92pqaNZUC9YwfH3xnH35zjb66L8Red+dFP1JuZWWmcVMzMrDROKmZm\nVhonFTMzK42TipmZlcZJxczMSuOkYmZmpek0qUg6XdJKkgZKulHSHEmf7Y3gzMysXorcqeyWBoLc\nC5gOvAs4vpVBmZlZPRVJKm1DuXwM+H1EvNTCeMzMrMaKjP31J0mPAK8CX5Q0AnittWGZmVkddXqn\nEhEnAjsCYyLiDbLBHvdtdWBmZlY/RRrqVyAbDficVLQW0OmgYmZm1v8UaVO5EHid7G4Fsomwvtey\niMzMrLaKJJUNIuJ04A2AiPg30OIxmc3MrI6KJJXXJQ0mzSUvaQNgQUujMjOzWirS+2sc8BdgHUm/\nJZvR8dBWBmVmZvXUaVKJiBskTQV2IKv2+mpEPNfyyMzMrHaKjv21NjAAWA7YWdL+rQvJzMzqqtM7\nFUkXAFsADwKLUnEA
V7YwLjMzq6EibSo7RMQmLY/EzMxqr0j1198lOamYmVmnitypTCBLLLPIuhIL\niIjYoqWRmZlZ7RRJKucDnwPuZ3GbipmZ2dsUSSpzImJiyyMxM7PaK5JU7pb0O+CP5J6kjwj3/jIz\nsyUUSSqDyZLJbrkydyk2M7O3KfJE/WG9EYiZmdVfh0lF0tcj4nRJPyMNJpkXEV9paWRmZlY7ze5U\nHk4/J/dGIGZmVn8dJpWI+KOkAcDmEXFcL8ZkZmY11fSJ+ohYSDbUvZmZWaeK9P66R9JE4PfA/LZC\ndyk2M7NGRZLKIGAusGuuzF2Kzczsbdyl2MzMStPpKMWSLpY0LLe+SppjxczMbAlFhr7fIiJebFuJ\niBeArVsXkpmZ1VWRpLKMpFXaViStSsEZIyU9K+mBXNnJkp6SdE9a9sxt+4akaZL+KWn3XPkeqWya\npBNz5etL+kcqv0zSckXesJmZtU6RpHIG2Xwqp0g6BbgdOL3AcRcBe7RTfmZEbJWWawHSJGAHAZum\nY86WNCA9J/ML4KPAJsDBuQnDTkvnehfwAnB4gZjMzKyFOk0qETEB2B+YnZb9I+LXBY67GXi+YBz7\nApdGxIKIeByYBmyXlmkR8VhEvA5cCuwrSWS90f6Qjr8Y2K/gtczMrEWKdCkmIh4CHirpmkdLOoRs\n+JdjUxvN2sAduX1mpjKAGQ3l2wPDgRcj4s129jczs4oUqf4q0znABsBWwDNkVWstJ+lISZMlTZ4z\nZ05vXNLMrF/q1aQSEbMjYmFELAJ+RVa9BfAUsE5u15GprKPyucAwScs2lHd03fERMSYixowYMaKc\nN2NmZm/Tq0lF0pq51Y8DbT3DJgIHSVpe0vrAhsCdwF3Ahqmn13JkjfkTIyKAm4AD0vFjgWt64z2Y\nmVnHms2nMo925lFpExErNTuxpEuAXYDVJM0ExgG7SNoqnXc68IV0rgclXU7WbvMm8KU0mCWSjgau\nBwYAF0TEg+kSJwCXSvoecDdwfmdv1szMWkvZH/1Ndsi6ET8D/BoQ8BlgzYg4qfXhlW/MmDExeXIX\npoiRWhcMQCeff4/UOXZw/J1x/M05/ua6GL+kKRExprP9ilR/7RMRZ0fEvIh4OSLOIesCbGZmtoQi\nSWW+pM+khxGXkfQZckPgm5mZtSmSVD4NHMjihx8/mcrMzMyWUGTo++m4usvMzAooMvT9RpJubBsY\nUtIWkr7V+tDMzKxuilR//Qr4BvAGQETcR/a8iJmZ2RKKJJUVIuLOhrI3293TzMz6tSJJ5TlJG5Ae\nhJR0ANlzK2ZmZksoMkrxl4DxwLslPQU8TvYApJmZ2RKKJJWIiA9LWhFYJiLmpfG5zMzMllCk+usK\ngIiYHxHzUtkfmuxvZmb9VLMBJd9NNr3vypL2z21aCRjU6sDMzKx+mlV/bQzsBQwD9s6VzwOOaGVQ\nZmZWTx0mlYi4RtKfgBMi4ge9GJOZmdVU0zaVNKfJfr0Ui5mZ1VyR3l+3Sfo5cBm50YkjYmrLojIz\ns1oqklS2Sj+/mysLYNfywzEzszorMkrxB3sjEDMzq78ioxSvLOknkian5QxJK/dGcGZmVi9FHn68\ngKwb8YFpeRm4sJVBmZlZPRVpU9kgIj6RW/+OpHtaFZCZmdVXkTuVVyW9v21F0vuAV1sXkpmZ1VWR\nO5UvAhendhQBzwNjWxqVmZnVUpHeX/cAW0paKa2/3PKozMyslor0/hou6SxgEnCTpP+WNLzlkZmZ\nWe0UaVO5FJgDfAI4IL2+rJVBmZlZPRVpU1kzIk7JrX9P0qdaFZCZmdVXkTuVv0o6SNIyaTkQuL7V\ngZmZWf0USSpHAL8DXgcWkFWHfUHSPElutDczs7cU6f01tDcCMTOz+ivS+0uSPivp22l9HUnbtT40\nMzOrmyLVX2cD7wU+ndZfAX7RsojMzKy2ivT+2j4iRku6GyAiXpC0XIvjMjOzGipyp/
KGpAFkE3Mh\naQSwqKVRmZlZLRVJKmcBVwGrS/o+cCvwg5ZGZWZmtVSk99dvJU0BPkQ2oOR+EfFwyyMzM7PaaZpU\nUrXXgxHxbuCR3gnJzMzqqmn1V0QsBP4pad1eisfMzGqsSJvKKsCDkm6UNLFt6ewgSRdIelbSA7my\nVSXdIOnR9HOVVC5JZ0maJuk+SaNzx4xN+z8qaWyufBtJ96djzpKkrr11MzMrW5Euxd/u5rkvAn4O\nTMiVnQjcGBGnSjoxrZ8AfBTYMC3bA+cA20taFRgHjCHrfTZF0sSIeCHtcwTwD+BaYA/gum7GamZm\nJSjSUP+37pw4Im6WNKqheF9gl/T6YrI5Wk5I5RMiIoA7JA2TtGba94aIeB5A0g3AHpImAStFxB2p\nfAKwH04qZmaVKlL9VaY1IuKZ9HoWsEZ6vTYwI7ffzFTWrHxmO+VmZlah3k4qb0l3JdEb15J0pKTJ\nkibPmTOnNy5pZtYvdZhUJN2Yfp5W4vVmp2ot0s9nU/lTwDq5/UamsmblI9spb1dEjI+IMRExZsSI\nET1+E2Zm1r5mdyprStoR2EfS1pJG55duXm8i0NaDayxwTa78kNQLbAfgpVRNdj2wm6RVUk+x3YDr\n07aXJe2Qen0dkjuXmZlVpFlD/UlkPb9GAj9p2BbArs1OLOkSsob21STNJOvFdSpwuaTDgSeAA9Pu\n1wJ7AtOAfwOHAUTE85JOAe5K+323rdEe+E+yHmaDyRro3UhvZlYxZU0bTXaQvt0wR32tjRkzJiZP\nnlz8gFY//tLJ598jdY4dHH9nHH9zjr+5LsYvaUpEjOlsvyJdik+RtA+wcyqaFBF/6lI0ZmbWLxSZ\n+fGHwFeBh9LyVUkepdjMzN6myBP1HwO2iohFAJIuBu4GvtnKwMzMrH6KPqcyLPd65VYEYmZm9Vfk\nTuWHwN2SbiKbT2VnsjG7zMzMllCkof6SNNbWtqnohIiY1dKozMyslorcqZAeNux0uHszM+vfKhv7\ny8zMlj5OKmZmVpqmSUXSAEmem97MzArxHPVmZlaaIg31bXPU3wnMbyuMiH1aFpWZmdVSK+eoNzOz\nfqbQHPWS1gM2jIj/kbQCMKD1oZmZWd0UGVDyCOAPwC9T0drA1a0MyszM6qlIl+IvAe8DXgaIiEeB\n1VsZlJmZ1VORpLIgIl5vW5G0LNnMj2ZmZksoklT+JumbwGBJHwF+D/yxtWGZmVkdFUkqJwJzgPuB\nL5DNJ/+tVgZlZmb1VKT316I0Mdc/yKq9/hmdTWxvZmb9UqdJRdLHgHOB/yObT2V9SV+IiOtaHZyZ\nmdVLkYcfzwA+GBHTACRtAPwZcFIxM7MlFGlTmdeWUJLHgHktisfMzGqswzsVSfunl5MlXQtcTtam\n8kngrl6IzczMaqZZ9dfeudezgQ+k13OAwS2LyMzMaqvDpBIRh/VmIGZmVn9Fen+tD3wZGJXf30Pf\nm5lZoyK9v64Gzid7in5Ra8MxM7M6K5JUXouIs1oeiZmZ1V6RpPLfksYBfwUWtBVGxNSWRWVmZrVU\nJKlsDnwO2JXF1V+R1s3MzN5SJKl8Enhnfvh7MzOz9hR5ov4BYFirAzEzs/orcqcyDHhE0l0s2abi\nLsVmZraEIkllXMujMDOzpUKR+VT+1huBmJlZ/RV5on4ei+ekXw4YCMyPiJVaGZiZmdVPkTuVoW2v\nJQnYF9ihlUGZmVk9Fen99ZbIXA3s3pOLSpou6X5J90ianMpWlXSDpEfTz1VSuSSdJWmapPskjc6d\nZ2za/1FJY3sSk5mZ9VyR6q/9c6vLAGOA10q49gcj4rnc+onAjRFxqqQT0/oJwEeBDdOyPXAOsL2k\nVck6EYwhq56bImliRLxQQmxmZtYNRXp/5edVeROYTlYFVrZ9gV3S64uBSWRJZV9gQkQEcIekYZLW\nTPveEBHPA0i6AdgDuKQFsZmZWQFF2lRaMa9KAH
+VFMAvI2I8sEZEPJO2zwLWSK/XBmbkjp2Zyjoq\nNzOzijSbTvikJsdFRJzSg+u+PyKekrQ6cIOkRxpPnhJOKSQdCRwJsO6665Z1WjMza9CsoX5+OwvA\n4WTVUt0WEU+ln88CVwHbAbNTtRbp57Np96eAdXKHj0xlHZW3d73xETEmIsaMGDGiJ6GbmVkTHSaV\niDijbQHGk81LfxhwKfDO7l5Q0oqShra9BnYjG19sItDWg2sscE16PRE4JPUC2wF4KVWTXQ/sJmmV\n1FNst1RmZmYVadqmknpYfQ34DFnj+egSeletAVyVPfLCssDvIuIvaWyxyyUdDjwBHJj2vxbYE5gG\n/JsssRERz0s6Bbgr7ffdtkZ7MzOrRrM2lR8B+5PdpWweEa+UccGIeAzYsp3yucCH2ikP4EsdnOsC\n4IIy4jIzs55r1qZyLLAW8C3gaUkvp2WepJd7JzwzM6uTDu9UIqJLT9ubmZk5cZiZWWmcVMzMrDRO\nKmZmVhonFTMzK42TipmZlcZJxczMSuOkYmZmpXFSMTOz0jipmJlZaZxUzMysNE4qZmZWGicVMzMr\njZOKmZmVxknFzMxK46RiZmalcVIxM7PSOKmYmVlpnFTMzKw0TipmZlYaJxUzMyuNk4qZmZXGScXM\nzErjpGJmZqVxUjEzs9I4qZiZWWmcVMzMrDROKmZmVhonFTMzK42TipmZlcZJxczMSuOkYmZmpXFS\nMTOz0jipmJlZaZxUzMysNE4qZmZWGicVMzMrTe2TiqQ9JP1T0jRJJ1Ydj5lZf1brpCJpAPAL4KPA\nJsDBkjapNiozs/6r1kkF2A6YFhGPRcTrwKXAvhXHZGbWby1bdQA9tDYwI7c+E9i+cSdJRwJHptVX\nJP2zhTGtBjxXeG+pdZF0XZ1jB8dfNcdfrVbHv16RneqeVAqJiPHA+N64lqTJETGmN65VtjrHDo6/\nao6/Wn0l/rpXfz0FrJNbH5nKzMysAnVPKncBG0paX9JywEHAxIpjMjPrt2pd/RURb0o6GrgeGABc\nEBEPVhxWr1SztUidYwfHXzXHX60+Eb8iouoYzMxsKVH36i8zM+tDnFTMzKw0TipmZlYaJ5V+TtLy\nRcr6IknvknS9pHvT+haSvlF1XP2FpMclPda4VB2XVcsN9T0g6fKIOFDS/UD+gxQQEbFFRaEVJmlq\nRIzurKwvkjQJ+Cbwi4jYWpKAByJi02oj65o0ht0a5HpjRsST1UVUjKThudVBwCeBVSPipIpCKkTS\n15ptj4if9FYsPdUXf3dq3aW4D/hq+rlXpVF0g6R3kA1zM1jS1mSJEGAlYIXKAuuaFSPidqXhJiIi\nJL1RcUxdIunLwDhgNrAoFQfQ5/8giYi5DUU/lTQF6NNJBRiafm4MbMviZ9v2Bu6sJKJu6Ku/O04q\nPRARz6S/FC6KiA9WHU8X7Q4cSjYKQf4vs3lkf/3XwVxJ65PuEiXtB8yqNqQu+yqwcTtf0H2epPzd\n7DLAGGrwnRIR3wGQdDMwOiLmpfWTgT9XGFpX9cnfnT7/C9DXRcRCSYskrRwRL1UdT1ERcTFwsaRP\nRMQVVcfTTUcD5wPvlvQE8AxwcLUhddkMoDa/Nw3OyL1+E5gOHFhNKN2yBvB6bv31VFYXffJ3x0ml\nHK8A90u6AZjfVhgRX6kupGIi4gpJHwM2JasXbyv/bnVRFRMR04BdJa1M1j74YtUxdcNjwCRJfwYW\ntBXWoV6/hnfnjSYAd0q6Kq3vB1xcYTxd1Sd/d5xUynFlWmpH0rlkbSgfBM4DDqAm9cqSvtmwDkBE\n/KCSgLrnybQsl5baSL0EPwGMYsmG4j7/BwlARHxf0nXATqnosIi4u8qYuqhP/u6491cPSFq36p4W\nPSXpvojYIvdzCHBdROzU6cEVk3RCbnUQ8DHgwYg4rKKQui197kTEK1XHUpSkv5BVv0wBFraVR8QZ\nHR7Ux0h6P7
BhRFwoaQQwJCIerzquOvOdSs9cDYwGkHRFRHyi4ni649X089+S1gKeB9asMJ7CIuK0\n/Lqk04C/VBROt0jaDPg1sGpafw44pA8MjFrEyIjYo+ogukvSOLLOBRsDFwIDgd8A76syrqIk3cSS\njzIAEBG7VhDOW5xUeiY/ddo7K4uiZ/4kaRhwOtlfnJBVg9XR8mS92epkPPC1iLgJQNIuwK+AHasM\nqqDbJW0eEfdXHUg3fRzYGpgKEBFPSxra/JA+5bjc60FkVZFvVhTLW5xUeiY6eN3nSdoWmBERp6T1\nIcD9wCPAmVXG1hlJy6ZpD+6TG2doAAAF+0lEQVRm8ec+gOwOq07tKZA9a3NT20pETJK0YpUBdUbS\nA2TPRSwLHJaeol9AjR76TV5Pzza1dUnv0597o4iY0lB0m6TK20OdVHpmS0kvk/1nGpxew+L/XCtV\nF1qnfgl8GEDSzsCpwJeBrcj+ej6gutA6dSdZtWM+xjeBWRGxoP1D+qzHJH2brAoM4LNkvXr6srXJ\nfk/q7nJJvwSGSToC+DzZXWItSFo1t9r2nNDKFYXzFieVHoiIAVXH0AMDIuL59PpTwPj0vMoVku6p\nMK4iBBAR/1d1ICX4PPAdFvcevCWV9WWPR8QTVQfRXZKOAW4HfkrW6/FlsnaVkyLihipj66IpLL5T\nb3tO6PDKokmcVPqvAW3VSMCHgCNz2/r678WIZuM3Vd1Pvysi4gWgzz/P1GD1mn/+I8kSyrvJqnxv\nI0syjdVJfVKu6nr9tD6WrD1lOvBQhaEBff/Lw1rnEuBvqbfRq2R/ISPpXfTBp3QbDACGsGRHiVqR\nNLHZ9ojYp7di6YZaf/4RcRyApOXIqox2BA4Dxkt6MSI2qTK+Ahqrrn9IH6q6dlLpp9KDXzeSNW7/\nNRY/sLQM2S9oX/ZMXR6wa+K9ZMNsXAL8g3p9QS8Nnz/AYLIBVFdOy9Nkdy59XZ+uunZS6cci4o52\nyv5VRSxdVKcv4I68A/gI2VhlnyYbyPCSmjyfUuvPX9J4smGJ5pEl9NuBn6SqyDro01XXnqTL6uhD\nVQfQUxGxMCL+EhFjgR2AaWTjOB1dcWhF1P3zX5fsmaZZwFPATKBO48a1VV1fQx+suvYwLWYVSWNn\nfYzsbmUU2bweF0TEU1XG1R+kCd02JWtP2RHYjGw0ib9HxLgqYytC0g4srrqen8o2IhtmZmqlsTmp\nmPU+SRPIvsiuBS6NiAcqDqlfkjSSbFiWHckm2xseEcOqjarenFTMKiBpEYunSWhvKuq+/OBsrUn6\nCovvUN4ga1NpW+6PiEVNDrdOVN6oY9YfRYTbM6szCvg98F8R8UzFsSx1fKdiZmal8V9LZmZWGicV\nMzMrjZOKWQNJCyXdI+lBSfdKOlZSS/+vSPpRut6PWnydUWnoerOWcEO92du9GhFbAUhaHfgd2XAe\nrXx+4Uhg1YhY2OmeZn2Y71TMmoiIZ8m+8I9WZpSkWyRNTcuOkD13Imm/tuMk/VbSvvlzpeN/JOkB\nSfdL+lQqn0g2QOOUtrLcMfdLGpaOnSvpkNz1PiJpQDrnXZLuk/SF3LHH58q/0/jeJL1T0t1p1Fuz\nUvhOxawTEfGYpAHA6sCzwEci4jVJG5INmTEGOB/4L+BqSSuTPQMxtuFU+5ONJLslsBpwl6SbI2If\nSa+03R01uI3s4bwnyCbv2gmYQDYg5RfJ5s94KSK2TU/o3ybpr8CGadmO7NmXiWlE2ycBJG0MXAoc\nGhH39vxTMss4qZh1zUDg55K2AhYCGwFExN8knS1pBNncFlekAf/y3k82aORCYLakvwHbkg3P0pFb\ngJ3Jkso5wJGS1gZeiIj5knYDtpDUNtz5ymTJZLe03J3Kh6TyJ4ERwDXA/hFR+fwbtnRxUjHrhKR3\nkiWQZ8naVWaT3W0sA7yW23UC2XTAB5HNz1GGm4EvkQ2C+P+Aj5PNl3FLW3jA
lyPi+oaYdwd+GBG/\nbCgfRTbo4JNkSc5JxUrlNhWzJtKdx7nAz9OcMyuTzSeyCPgc2YRVbS4CjgHo4A7gFuBTqR1kBNkd\nyJ3Nrh8RM8iqyjaMiMeAW4HjyJINwPXAFyUNTPFuJGnFVP55SUNS+dqp0wHA62TJ6RBJny78YZgV\n4DsVs7cbnCY7Gkg29/evgbYpcs8mmwzpEOAvLB6/i4iYLelh4OoOznsVWVvIvWTjfX09ImYViOcf\nLE5et5DN9HdrWj+PbNiRqWnk3TnAfhHxV0nvAf6eFfMK2V3UwhTrfEl7ATek9pymM1GaFeVhWsxK\nImkFspkDR0dE5fNamFXB1V9mJZD0YeBh4GdOKNaf+U7FzMxK4zsVMzMrjZOKmZmVxknFzMxK46Ri\nZmalcVIxM7PSOKmYmVlp/j9ruWYWoc0OMQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [] + }, + "output_type": "display_data" + } + ], + "source": [ + "cp.sort_values('Count', ascending=False).plot(kind='bar', color= 'red', x='Day_of_week', y='Count')\n", + "plt.xlabel(\"Day of week\")\n", + "plt.ylabel(\"Number of reported crimes\")\n", + "plt.title(\"No.of reported crimes per day\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "x62BiCgBMOtq" + }, + "source": [ + "# RDD" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "VGXK6uEuUKRh" + }, + "source": [ + "> With map, you define a function and then apply it record by record. Flatmap returns a new RDD by first applying a function to all of the elements in RDDs and then flattening the result. Filter, returns a new RDD. Meaning only the elements that satisfy a condition. With reduce, we are taking neighboring elements and producing a single combined result.\n", + "For example, let's say you have a set of numbers. You can reduce this to its sum by providing a function that takes as input two values and reduces them to one. 
\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 + }, + "colab_type": "code", + "id": "0_WvAgyvR7m6", + "outputId": "a89a2324-9d80-4b70-c8f7-ad1ddc8f4956" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DISTRICT,DISTRICT NAME,ADDRESS,CITY,STATE,ZIP,WEBSITE,PHONE,FAX,TTY,X COORDINATE,Y COORDINATE,LATITUDE,LONGITUDE,LOCATION\n", + "1,Central,1718 S State St,Chicago,IL,60616,http://home.chicagopolice.org/community/districts/1st-district-central/,312-745-4290,312-745-3694,312-745-3693,1176569.052,1891771.704,41.85837259,-87.62735617,\"(41.8583725929, -87.627356171)\"\n" + ] + } + ], + "source": [ + "psrdd = sc.textFile('police-station.csv')\n", + "ps_header = psrdd.first()  # fetch the header once: each first() call runs a separate Spark job\n", + "print(ps_header)\n", + "ps_rest = psrdd.filter(lambda line: line != ps_header)\n", + "print(ps_rest.first())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "P65eAFO3Mkdd" + }, + "source": [ + "**How many police stations are there?**" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "id": "Vi03EU0CMSmO", + "outputId": "5c566a1f-86ae-4175-956c-c6b2f6b7d692" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "24" + ] + }, + "execution_count": 53, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "ps_rest.count()  # counting needs no parsing: a map() would not change the number of records" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "3c4bci70MnlQ" + }, + "source": [ + "**Display the District ID, District name, Address and Zip for the police station with District ID 7**" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": 
"code", + "id": "fWFpo_WxMnvm", + "outputId": "cae005e4-efb8-4bf9-c3fe-caff8a69710c" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('7', 'Englewood', '1438 W 63rd St', '60636')]" + ] + }, + "execution_count": 54, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "# District is column 0\n", + "(ps_rest.filter(lambda line: line.split(\",\")[0]=='7').\n", + " map(lambda line: (line.split(\",\")[0],\n", + " line.split(\",\")[1],\n", + " line.split(\",\")[2],\n", + " line.split(\",\")[5])).collect())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ZYmb5FscMph3" + }, + "source": [ + "**Police stations 10 and 11 are geographically close to each other. Display the District ID, District name, address and zip code**" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 + }, + "colab_type": "code", + "id": "6ZcRIX3mMquF", + "outputId": "319981cb-4fae-4945-dc43-100b248144a5" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('11', 'Harrison', '3151 W Harrison St', '60612'),\n", + " ('10', 'Ogden', '3315 W Ogden Ave', '60623')]" + ] + }, + "execution_count": 55, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "# District is column 0\n", + "(ps_rest.filter(lambda line: line.split(\",\")[0] in ['10', '11']).\n", + " map(lambda line: (line.split(\",\")[0],\n", + " line.split(\",\")[1],\n", + " line.split(\",\")[2],\n", + " line.split(\",\")[5])).collect())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "3wn2zXe7TbI3" + }, + "source": [] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [ + "qJoeN3e8_Gzk", + "6Ul54hAYyHyd", + "X6zdrH15_CCW", + "Dd6t0uFzuR4X", + "hmIqq6xPK7m7", + "HgwoX-pfNqQI", + "_QwZtWxZRCBn", + "eFoagdqARKb8", + "rsD48rckdHPe", + "WbKK5iHwmIoV", + "xOQPOt19q_he", + 
"aHjILb1DriuX", + "PIKigra7A34e", + "ldtA0wk9BMkT", + "KQ6Ul9HGCwC3", + "sY6PstyLDp6P", + "7OZElEvcGOD1", + "EEEB2TVqL4Ie", + "x62BiCgBMOtq" + ], + "name": "Colab and PySpark.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.16" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}