Skip to content

Commit

Permalink
fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
gsingers committed Apr 14, 2023
1 parent 05a87dd commit 14e1bf5
Show file tree
Hide file tree
Showing 8 changed files with 69 additions and 14 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ while also allowing for local file editing.
3. `mkdir datasets`
2. You can then proceed with downloading the data, etc., as required by the project.
3. (Optional) Run docker by attaching volumes for the repo and the dataset:
3. Interactive: `docker run -v <PATH TO WHERE YOU CLONED THIS REPO>:/workspace/search_engineering -v ~/projects/corise/search_engineering/datasets:/workspace/datasets --network docker_opensearch-net --name search_engineering -it gsingers/search_engineering:latest`
3. Interactive: `docker run -v <PATH TO WHERE YOU CLONED THIS REPO>:/workspace/search_engineering -v ~/projects/corise/search_engineering/datasets:/workspace/datasets --network opensearch-net --name search_engineering -it gsingers/search_engineering:latest`
4. You can also run natively, but our ability to support you will be limited. If you do, please see the `.gitpod.Dockerfile` for system requirements, Python versions, etc.
5. If you are running locally, be sure to `pip install` the `requirements.txt` file located in the root directory into a virtual environment running Python 3.9.7. Again, see `.gitpod.Dockerfile` if you are not sure.

Expand Down
2 changes: 2 additions & 0 deletions docker/docker-compose-w1.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,7 @@ services:
volumes:
search-eng-week1:

# explicitly set name, otherwise the docker network name is prepended with the folder (project) name
networks:
opensearch-net:
name: opensearch-net
2 changes: 2 additions & 0 deletions docker/docker-compose-w2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,7 @@ services:
volumes:
search-eng-week2:

# explicitly set name, otherwise the docker network name is prepended with the folder (project) name
networks:
opensearch-net:
name: opensearch-net
2 changes: 2 additions & 0 deletions docker/docker-compose-w3.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,5 +85,7 @@ volumes:
search-eng-week3-d2:
search-eng-week3-d3:

# explicitly set name, otherwise the docker network name is prepended with the folder (project) name
networks:
opensearch-net:
name: opensearch-net
2 changes: 2 additions & 0 deletions docker/docker-compose-w4.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,5 +61,7 @@ volumes:
opensearch-data1:
opensearch-data2:

# explicitly set name, otherwise the docker network name is prepended with the folder (project) name
networks:
opensearch-net:
name: opensearch-net
2 changes: 1 addition & 1 deletion install-packages.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# These are some helpful packages you may want to install in the OpenSearch containers.
These are some helpful packages you may want to install in the OpenSearch containers. We purposefully don't install them for you so that we don't have to ship and maintain a custom package for this class.

To begin, you'll need to attach to the running OpenSearch instance as root:

Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ urljoin
pandas
fastparquet
click
lxml
lxml
kaggle
68 changes: 57 additions & 11 deletions week1/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,58 @@
"type":"type/text()",
"shortDescription": "shortDescription/text()",
"startDate": "startDate/text()",
"active": "active/text()",
"regularPrice": "regularPrice/text()",
"salePrice": "salePrice/text()",
"shortDescription": "shortDescription/text()",
"shortDescriptionHtml": "shortDescriptionHtml/text()",
"longDescription": "longDescription/text()",
"longDescriptionHtml": "longDescriptionHtml/text()",
"artistName": "artistName/text()",
"onSale": "onSale/text()",
"digital": "digital/text()",
"frequentlyPurchasedWith": "frequentlyPurchasedWith/*/text()", # Note the match all here to get the subfields
"accessories": "accessories/*/text()" , # Note the match all here to get the subfields
"relatedProducts": "relatedProducts/*/text()", # Note the match all here to get the subfields
"crossSell": "crossSell/text()",
"salesRankShortTerm": "salesRankShortTerm/text()",
"salesRankMediumTerm": "salesRankMediumTerm/text()",
"salesRankLongTerm": "salesRankLongTerm/text()",
"bestSellingRank": "bestSellingRank/text()",
"url": "url/text()",
"categoryPath": "categoryPath/*/name/text()", # Note the match all here to get the subfields
"categoryPathIds": "categoryPath/*/id/text()", # Note the match all here to get the subfields
"categoryLeaf": "categoryPath/category[last()]/id/text()",
"categoryPathCount": "count(categoryPath/*/name)",
"customerReviewCount": "customerReviewCount/text()",
"customerReviewAverage": "customerReviewAverage/text()",
"inStoreAvailability": "inStoreAvailability/text()",
"onlineAvailability": "onlineAvailability/text()",
"releaseDate": "releaseDate/text()",
"shippingCost": "shippingCost/text()",
"class": "class/text()",
"classId": "classId/text()",
"subclass": "subclass/text()",
"subclassId": "subclassId/text()",
"department": "department/text()",
"departmentId": "departmentId/text()",
"bestBuyItemId": "bestBuyItemId/text()",
"description": "description/text()",
"manufacturer": "manufacturer/text()",
"modelNumber": "modelNumber/text()",
"image": "image/text()",
"condition": "condition/text()",
"inStorePickup": "inStorePickup/text()",
"homeDelivery": "homeDelivery/text()",
"quantityLimit": "quantityLimit/text()",
"color": "color/text()",
"depth": "depth/text()",
"height": "height/text()",
"weight": "weight/text()",
"shippingWeight": "shippingWeight/text()",
"width": "width/text()",
"features": "features/*/text()" # Note the match all here to get the subfields

}
'''
"startDate": "startDate/text()",
Expand Down Expand Up @@ -159,11 +211,11 @@ def main(source_dir: str, file_glob: str, index_name: str, workers: int, host: s
f"Indexing {source_dir} to {index_name} with {workers} workers, refresh_interval of {refresh_interval} to host {host} with a maximum number of docs sent per file per worker of {max_docs} and {batch_size} per batch.")
files = glob.glob(source_dir + "/" + file_glob)
docs_indexed = 0
# Set refresh interval to -1
client = get_opensearch(host)

# Set index refresh interval here before indexing
client = get_opensearch(host)

#TODO: set the refresh interval
client.indices.put_settings(index = index_name, body= refresh_settings)
logger.debug(client.indices.get_settings(index=index_name))
start = perf_counter()
time_indexing = 0
Expand All @@ -176,14 +228,8 @@ def main(source_dir: str, file_glob: str, index_name: str, workers: int, host: s

finish = perf_counter()
logger.info(f'Done. {docs_indexed} were indexed in {(finish - start)/60} minutes. Total accumulated time spent in `bulk` indexing: {time_indexing/60} minutes')
# set refresh interval back to 5s
refresh_settings = {
'settings': {
'index': {
'refresh_interval': "5s"
}
}
}
# TODO set refresh interval back to 5s

client.indices.put_settings(index = index_name, body= refresh_settings)
logger.debug(client.indices.get_settings(index=index_name))

Expand Down

0 comments on commit 14e1bf5

Please sign in to comment.