-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
de65f57
commit b57b1f4
Showing
27 changed files
with
1,370 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# Python | ||
__pycache__ | ||
*.pyc | ||
*.pyi | ||
venv | ||
.cache | ||
notebooks/* | ||
tests/* | ||
|
||
# IDE | ||
.idea | ||
|
||
# Logs | ||
*.log | ||
|
||
# Build | ||
dist | ||
*.egg-info | ||
|
||
# Git | ||
.git/ | ||
.github | ||
|
||
# Documentation | ||
doc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
FROM ubuntu:18.04 | ||
|
||
RUN apt-get update \ | ||
&& apt-get install -y wget \ | ||
gnupg2 | ||
|
||
RUN wget -qO - https://www.mongodb.org/static/pgp/server-3.4.asc | apt-key add - | ||
|
||
RUN echo "deb [ arch=amd64,arm64 ] http://repo.mongodb.org/apt/ubuntu xenial/mongodb-org/3.4 multiverse" | tee /etc/apt/sources.list.d/mongodb-org-3.4.list | ||
|
||
RUN apt-get update && apt-get install -y --no-install-recommends \ | ||
python3.6 \ | ||
python3-pip \ | ||
libpython3.6 \ | ||
jq \ | ||
mongodb-org \ | ||
locales \ | ||
locales-all \ | ||
python3-setuptools \ | ||
g++ \ | ||
python3-dev \ | ||
npm \ | ||
curl \ | ||
&& \ | ||
apt-get clean && \ | ||
rm -rf /var/lib/apt/lists/* | ||
|
||
WORKDIR /src | ||
|
||
ENV LC_ALL en_US.UTF-8 | ||
ENV LANG en_US.UTF-8 | ||
ENV LANGUAGE en_US.UTF-8 | ||
|
||
COPY requirements.txt /src/requirements.txt | ||
RUN pip3 install --upgrade pip | ||
RUN pip3 install -r requirements.txt --proxy=${HTTP_PROXY} | ||
|
||
COPY . /src |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
DOCKER_IMAGE_NAME=dataesr/scientific-tagger | ||
CURRENT_VERSION=$(shell cat project/__init__.py | cut -d "'" -f 2) | ||
|
||
start: | ||
@echo Scientific Tagger starting... | ||
docker-compose up -d | ||
@echo Scientific Tagger started http://localhost:5004 | ||
|
||
stop: | ||
@echo Matcher stopping... | ||
docker-compose down | ||
@echo Matcher stopped | ||
|
||
release: | ||
echo "__version__ = '$(VERSION)'" > project/__init__.py | ||
git commit -am '[release] version $(VERSION)' | ||
git tag $(VERSION) | ||
@echo If everything is OK, you can push with tags i.e. git push origin main --tags | ||
|
||
docker-build: | ||
@echo Building a new docker image | ||
docker build -t $(DOCKER_IMAGE_NAME):$(CURRENT_VERSION) -t $(DOCKER_IMAGE_NAME):latest . | ||
@echo Docker image built | ||
|
||
docker-push: | ||
@echo Pushing a new docker image | ||
docker push $(DOCKER_IMAGE_NAME):$(CURRENT_VERSION) | ||
docker push $(DOCKER_IMAGE_NAME):latest | ||
@echo Docker image pushed | ||
|
||
python-build: | ||
@echo Building a python package | ||
python setup.py sdist | ||
@echo Python package built |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
# harvest-hal | ||
# Download pdf | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
sudo docker build -f Dockerfile -t dataesr/harvest-hal -t dataesr/harvest-hal:0.0.0 . | ||
sudo docker push -a dataesr/harvest-hal |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
import redis | ||
from rq import Connection, Worker | ||
from flask.cli import FlaskGroup | ||
|
||
from project.server import create_app | ||
from werkzeug.serving import WSGIRequestHandler | ||
|
||
app = create_app() | ||
cli = FlaskGroup(create_app=create_app) | ||
|
||
|
||
@cli.command()
def test():
    """Runs the unit tests without test coverage."""
    # ``unittest`` is not among the imports visible at the top of this module,
    # so without this function-scope import the command fails with a NameError
    # on its first line.
    import unittest

    # Collect every module matching test*.py under project/tests.
    tests = unittest.TestLoader().discover("project/tests", pattern="test*.py")
    result = unittest.TextTestRunner(verbosity=2).run(tests)

    # 0 signals success, 1 signals at least one failure or error.
    # NOTE(review): click appears to discard a command's return value when run
    # from the command line, so the process exit status may not reflect this
    # value -- confirm before relying on it in CI.
    return 0 if result.wasSuccessful() else 1
|
||
|
||
@cli.command("run_worker")
def run_worker():
    # Open the Redis connection described by the application's REDIS_URL.
    connection = redis.from_url(app.config["REDIS_URL"])

    # Inside the connection context, build a worker for the queues listed in
    # the application's QUEUES setting and start it.
    with Connection(connection):
        Worker(app.config["QUEUES"]).work()
|
||
|
||
if __name__ == "__main__": | ||
WSGIRequestHandler.protocol_version = "HTTP/1.1" | ||
cli() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,218 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 77, | ||
"id": "fd46ad3a", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import requests\n", | ||
"from urllib.parse import quote_plus" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "5cde00fd", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"year = 2000\n", | ||
"nb_rows = 100\n", | ||
"cursor='*'\n", | ||
"data = []\n", | ||
"while True:\n", | ||
" url = f'https://api.archives-ouvertes.fr/search/?q=*:*&wt=json&fl=*&fq=submittedDateY_i:[{year}%20TO%20{year}]&sort=docid asc&rows={nb_rows}&cursorMark={cursor}'\n", | ||
" r = requests.get(url)\n", | ||
" res = r.json()\n", | ||
" new_cursor = quote_plus(res['nextCursorMark'])\n", | ||
" data += res['response']['docs']\n", | ||
" if new_cursor == cursor:\n", | ||
" break\n", | ||
" cursor = new_cursor" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 83, | ||
"id": "53d07165", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"1675" | ||
] | ||
}, | ||
"execution_count": 83, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"len(data)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 66, | ||
"id": "6353259d", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"{'error': {'metadata': ['error-class',\n", | ||
" 'org.apache.solr.common.SolrException',\n", | ||
" 'root-error-class',\n", | ||
" 'java.lang.IllegalArgumentException'],\n", | ||
" 'msg': \"Unable to parse 'cursorMark' after totem: value must either be '*' or the 'nextCursorMark' returned by a previous search: AoFa12c\",\n", | ||
" 'code': 400}}" | ||
] | ||
}, | ||
"execution_count": 66, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"r.json()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 62, | ||
"id": "a83e23c4", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"'https://api.archives-ouvertes.fr/search/?q=*:*&wt=json&fl=*&fq=submittedDateY_i:[1950%20TO%201950]&sort=docid asc&rows=1&cursorMark=AoFQrnI='" | ||
] | ||
}, | ||
"execution_count": 62, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"url" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 30, | ||
"id": "31dfec4b", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"dict_keys(['numFound', 'start', 'docs'])" | ||
] | ||
}, | ||
"execution_count": 30, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"r.json()['response'].keys()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 35, | ||
"id": "fa429c97", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"dict_keys(['response', 'nextCursorMark'])" | ||
] | ||
}, | ||
"execution_count": 35, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"r.json().keys()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 49, | ||
"id": "8f368b0a", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"160" | ||
] | ||
}, | ||
"execution_count": 49, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"r.json()['response']['numFound']" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 54, | ||
"id": "295c75e3", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"'AoFQrnI='" | ||
] | ||
}, | ||
"execution_count": 54, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"r.json()['nextCursorMark']" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "58daec78", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.1" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
Oops, something went wrong.