diff --git a/README.md b/README.md
index 243c33f..20c429a 100644
--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@
 while also allowing for local file editing.
 3. `mkdir datasets`
 2. You can proceed as required by the project with downloading the data, etc.
 3. (Optional) Run docker by attaching volumes for the repo and the dataset:
-   3. Interactive: `docker run -v :/workspace/search_engineering -v ~/projects/corise/search_engineering/datasets:/workspace/datasets --network docker_opensearch-net --name search_engineering -it gsingers/search_engineering:latest`
+   3. Interactive: `docker run -v :/workspace/search_engineering -v ~/projects/corise/search_engineering/datasets:/workspace/datasets --network opensearch-net --name search_engineering -it gsingers/search_engineering:latest`
 4. You can also run natively, but our ability to support you will be limited. If you do, please see the `.gitpod.Dockerfile` for system requirements, Python versions, etc.
 5. If you are running locally, be sure to `pip install` the `requirements.txt` file located in the root directory into a virtual environment running Python 3.9.7. Again, see `.gitpod.Dockerfile` if you are not sure.
diff --git a/docker/docker-compose-w1.yml b/docker/docker-compose-w1.yml
index b3145e2..bafa125 100644
--- a/docker/docker-compose-w1.yml
+++ b/docker/docker-compose-w1.yml
@@ -33,5 +33,7 @@ services:
 volumes:
   search-eng-week1:
 
+# explicitly set name, otherwise the docker network name is prepended with the folder (project) name
 networks:
   opensearch-net:
+    name: opensearch-net
diff --git a/docker/docker-compose-w2.yml b/docker/docker-compose-w2.yml
index 1a76cad..bc5d7ca 100644
--- a/docker/docker-compose-w2.yml
+++ b/docker/docker-compose-w2.yml
@@ -33,5 +33,7 @@ services:
 volumes:
   search-eng-week2:
 
+# explicitly set name, otherwise the docker network name is prepended with the folder (project) name
 networks:
   opensearch-net:
+    name: opensearch-net
\ No newline at end of file
diff --git a/docker/docker-compose-w3.yml b/docker/docker-compose-w3.yml
index ea82472..d6e1974 100644
--- a/docker/docker-compose-w3.yml
+++ b/docker/docker-compose-w3.yml
@@ -85,5 +85,7 @@ volumes:
   search-eng-week3-d2:
   search-eng-week3-d3:
 
+# explicitly set name, otherwise the docker network name is prepended with the folder (project) name
 networks:
   opensearch-net:
+    name: opensearch-net
\ No newline at end of file
diff --git a/docker/docker-compose-w4.yml b/docker/docker-compose-w4.yml
index 5f8794d..f8ec009 100644
--- a/docker/docker-compose-w4.yml
+++ b/docker/docker-compose-w4.yml
@@ -61,5 +61,7 @@ volumes:
   opensearch-data1:
   opensearch-data2:
 
+# explicitly set name, otherwise the docker network name is prepended with the folder (project) name
 networks:
   opensearch-net:
+    name: opensearch-net
\ No newline at end of file
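Aside on the compose changes above: without an explicit `name:`, Docker Compose prefixes the network with the project (folder) name, e.g. `docker_opensearch-net`, which is why the README's `--network` flag changes in the same commit. A quick way to verify the fixed name after bringing a stack up (standard Docker CLI, shown for week 1):

```bash
# Start the week 1 stack, then confirm the network kept its explicit name.
docker compose -f docker/docker-compose-w1.yml up -d
docker network ls | grep opensearch-net   # expect "opensearch-net", not "docker_opensearch-net"
docker network inspect opensearch-net     # shows attached containers, subnet, etc.
```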
diff --git a/install-packages.md b/install-packages.md
index 6c7667e..0d8aabf 100644
--- a/install-packages.md
+++ b/install-packages.md
@@ -1,4 +1,4 @@
-# These are some helpful packages you may want to install in the OpenSearch containers.
+These are some helpful packages you may want to install in the OpenSearch containers. We purposefully don't install them for you so that we don't have to ship and maintain a custom package for this class.
 
 To begin, you'll need to attach to the running OpenSearch instance as root:
 
diff --git a/requirements.txt b/requirements.txt
index 730c015..3299fed 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,4 +5,5 @@ urljoin
 pandas
 fastparquet
 click
-lxml
\ No newline at end of file
+lxml
+kaggle
\ No newline at end of file
diff --git a/week1/index.py b/week1/index.py
index b69f66e..f217581 100644
--- a/week1/index.py
+++ b/week1/index.py
@@ -29,6 +29,58 @@
 "type":"type/text()",
 "shortDescription": "shortDescription/text()",
 "startDate": "startDate/text()",
+"active": "active/text()",
+"regularPrice": "regularPrice/text()",
+"salePrice": "salePrice/text()",
+"shortDescription": "shortDescription/text()",
+"shortDescriptionHtml": "shortDescriptionHtml/text()",
+"longDescription": "longDescription/text()",
+"longDescriptionHtml": "longDescriptionHtml/text()",
+"artistName": "artistName/text()",
+"onSale": "onSale/text()",
+"digital": "digital/text()",
+"frequentlyPurchasedWith": "frequentlyPurchasedWith/*/text()", # Note the match all here to get the subfields
+"accessories": "accessories/*/text()", # Note the match all here to get the subfields
+"relatedProducts": "relatedProducts/*/text()", # Note the match all here to get the subfields
+"crossSell": "crossSell/text()",
+"salesRankShortTerm": "salesRankShortTerm/text()",
+"salesRankMediumTerm": "salesRankMediumTerm/text()",
+"salesRankLongTerm": "salesRankLongTerm/text()",
+"bestSellingRank": "bestSellingRank/text()",
+"url": "url/text()",
+"categoryPath": "categoryPath/*/name/text()", # Note the match all here to get the subfields
+"categoryPathIds": "categoryPath/*/id/text()", # Note the match all here to get the subfields
+"categoryLeaf": "categoryPath/category[last()]/id/text()",
+"categoryPathCount": "count(categoryPath/*/name)",
+"customerReviewCount": "customerReviewCount/text()",
+"customerReviewAverage": "customerReviewAverage/text()",
+"inStoreAvailability": "inStoreAvailability/text()",
+"onlineAvailability": "onlineAvailability/text()",
+"releaseDate": "releaseDate/text()",
+"shippingCost": "shippingCost/text()",
+"class": "class/text()",
+"classId": "classId/text()",
+"subclass": "subclass/text()",
+"subclassId": "subclassId/text()",
+"department": "department/text()",
+"departmentId": "departmentId/text()",
+"bestBuyItemId": "bestBuyItemId/text()",
+"description": "description/text()",
+"manufacturer": "manufacturer/text()",
+"modelNumber": "modelNumber/text()",
+"image": "image/text()",
+"condition": "condition/text()",
+"inStorePickup": "inStorePickup/text()",
+"homeDelivery": "homeDelivery/text()",
+"quantityLimit": "quantityLimit/text()",
+"color": "color/text()",
+"depth": "depth/text()",
+"height": "height/text()",
+"weight": "weight/text()",
+"shippingWeight": "shippingWeight/text()",
+"width": "width/text()",
+"features": "features/*/text()" # Note the match all here to get the subfields
+
 }
 '''
 "startDate": "startDate/text()",
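The added mappings pair each index field with an XPath expression; the `*` entries deliberately match all child elements, and `count(...)` returns a number rather than a node list. A minimal sketch of how such a mapping can be applied with lxml (already pinned in `requirements.txt`); the helper name and inline XML are made up for illustration:

```python
from lxml import etree

# Tiny subset of the field -> XPath mapping from week1/index.py.
mappings = {
    "name": "name/text()",
    "categoryPath": "categoryPath/*/name/text()",       # match all category names
    "categoryPathCount": "count(categoryPath/*/name)",  # XPath count() yields a float
}

def doc_from_product(product):
    """Build an indexable dict from one <product> element (hypothetical helper)."""
    doc = {}
    for field, expr in mappings.items():
        value = product.xpath(expr)  # list of strings, or a float for count()
        if isinstance(value, list):
            if not value:
                continue             # skip fields with no matching nodes
            value = value[0] if len(value) == 1 else value
        doc[field] = value
    return doc

product = etree.fromstring(
    "<product><name>Widget</name><categoryPath>"
    "<category><name>Home</name></category>"
    "<category><name>Gadgets</name></category>"
    "</categoryPath></product>"
)
print(doc_from_product(product))
# {'name': 'Widget', 'categoryPath': ['Home', 'Gadgets'], 'categoryPathCount': 2.0}
```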
@@ -159,11 +211,11 @@ def main(source_dir: str, file_glob: str, index_name: str, workers: int, host: s
         f"Indexing {source_dir} to {index_name} with {workers} workers, refresh_interval of {refresh_interval} to host {host} with a maximum number of docs sent per file per worker of {max_docs} and {batch_size} per batch.")
     files = glob.glob(source_dir + "/" + file_glob)
     docs_indexed = 0
 
-    # Set refresh interval to -1
-    client = get_opensearch(host)
-    # Set index refresh interval here before indexing
+    client = get_opensearch(host)
+    #TODO: set the refresh interval
+
     client.indices.put_settings(index = index_name, body= refresh_settings)
     logger.debug(client.indices.get_settings(index=index_name))
     start = perf_counter()
     time_indexing = 0
@@ -176,14 +228,8 @@ def main(source_dir: str, file_glob: str, index_name: str, workers: int, host: s
 
     finish = perf_counter()
     logger.info(f'Done. {docs_indexed} were indexed in {(finish - start)/60} minutes. Total accumulated time spent in `bulk` indexing: {time_indexing/60} minutes')
-    # set refresh interval back to 5s
-    refresh_settings = {
-        'settings': {
-            'index': {
-                'refresh_interval': "5s"
-            }
-        }
-    }
+    # TODO set refresh interval back to 5s
+
     client.indices.put_settings(index = index_name, body= refresh_settings)
     logger.debug(client.indices.get_settings(index=index_name))
 
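The two TODOs replace the removed solution code, which shows the expected shape of the settings body: a nested `{'settings': {'index': {'refresh_interval': ...}}}` dict passed to `indices.put_settings`. One way they might be filled in, sketched from the deleted block (the `-1` and `5s` values come straight from the old comments and dict):

```python
# Before bulk indexing: disable refresh so segments aren't published mid-load.
refresh_settings = {
    'settings': {
        'index': {
            'refresh_interval': '-1'
        }
    }
}
client.indices.put_settings(index=index_name, body=refresh_settings)

# ... bulk indexing ...

# Afterwards: restore a periodic refresh so new documents become searchable.
refresh_settings = {
    'settings': {
        'index': {
            'refresh_interval': '5s'
        }
    }
}
client.indices.put_settings(index=index_name, body=refresh_settings)
```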