Update to debian bookworm #2214

Draft: wants to merge 1 commit into develop

(This repository was archived by the owner on Oct 16, 2024, and is now read-only.)
15 changes: 7 additions & 8 deletions capstone/Dockerfile
@@ -1,8 +1,8 @@
-FROM python:3.7-buster
+FROM python:3.11.9-bookworm
ENV PYTHONUNBUFFERED 1

# Enable apt-get -t buster-backports
-RUN echo 'deb http://deb.debian.org/debian buster-backports main' > /etc/apt/sources.list.d/backports.list
+#RUN echo 'deb http://deb.debian.org/debian buster-backports main' > /etc/apt/sources.list.d/backports.list

# Get build dependencies and packages required by the app
# FIRST LINE:
@@ -22,9 +22,9 @@ RUN echo 'deb http://deb.debian.org/debian buster-backports main' > /etc/apt/sou
# htmltidy for fastcase ingest
RUN apt-get update \
&& apt-get install -y redis-server postgresql-client libtiff-tools pdftk \
-&& apt-get install -y librocksdb5.17 librocksdb-dev libsnappy-dev zlib1g-dev libbz2-dev libgflags-dev liblz4-dev rocksdb-tools \
+&& apt-get install -y librocksdb7.8 librocksdb-dev libsnappy-dev zlib1g-dev libbz2-dev libgflags-dev liblz4-dev rocksdb-tools \
&& apt-get install -y libx11-xcb1 libxtst6 libgtk-3-0 libnss3 \
-&& echo libhyperscan5 libhyperscan/cpu-ssse3 boolean true | debconf-set-selections && apt-get -t buster-backports install -y libhyperscan-dev \
+&& echo libhyperscan5 libhyperscan/cpu-ssse3 boolean true | debconf-set-selections && apt-get install -y libhyperscan-dev \
&& apt-get install -y tidy \
&& apt-get install -y libdrm2 libgbm-dev \
&& apt-get install --no-install-recommends libdbus-glib-1-2
@@ -33,7 +33,7 @@ RUN apt-get update \
RUN mkdir /app
WORKDIR /app
COPY requirements.txt /app
-RUN pip install pip==21.3.1 \
+RUN pip install pip==24.0 \
&& pip install -r requirements.txt --src /usr/local/src \
&& rm requirements.txt

@@ -44,9 +44,8 @@ RUN echo "--modules-folder /node_modules" > /.yarnrc
COPY package.json /app
COPY yarn.lock /app
# pin node version -- see https://github.com/nodesource/distributions/issues/33
-RUN curl -o nodejs.deb https://deb.nodesource.com/node_14.x/pool/main/n/nodejs/nodejs_14.21.3-1nodesource1_amd64.deb \
-&& dpkg -i ./nodejs.deb \
-&& rm nodejs.deb \
+RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
+&& apt-get install -y nodejs=20.12.0-1nodesource1 \
&& npm install -g [email protected] \
&& yarn install --frozen-lockfile \
&& rm package.json \
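In short: the base image moves from Debian buster (Python 3.7) to bookworm (Python 3.11.9), so the buster-backports apt source is commented out rather than updated; on bookworm, libhyperscan-dev installs from the regular repositories, and librocksdb moves to the 7.8 series that bookworm ships. Node.js now comes from NodeSource's setup_20.x script instead of a hand-downloaded .deb. One caveat the diff does not address: pinning the exact build nodejs=20.12.0-1nodesource1 can start failing if NodeSource rotates that build out of its repository, so the pin may need periodic bumps.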
24 changes: 1 addition & 23 deletions capstone/capapi/api_urls.py
@@ -1,9 +1,7 @@
from django.conf import settings
from django.urls import path, re_path, include
from django.views.generic import RedirectView, TemplateView
-from rest_framework import routers, permissions
-from drf_yasg.views import get_schema_view
-from drf_yasg import openapi
+from rest_framework import routers

from capapi.views import api_views

@@ -27,32 +25,12 @@ def get(self, request, *args, **kwargs):
return super().get(request, *args, **kwargs)
router.APIRootView = FilteredAPIRootView


-schema_view = get_schema_view(
-    openapi.Info(
-        title="CAP API",
-        default_version='v1',
-        description="United States Caselaw",
-        terms_of_service="https://%s/terms" % settings.PARENT_HOST,
-        contact=openapi.Contact(url="https://%s/contact" % settings.PARENT_HOST),
-    ),
-    urlconf='capapi.api_urls',
-    validators=['flex', 'ssv'],
-    public=True,
-    permission_classes=(permissions.AllowAny,),
-)

urlpatterns = [
    path('v1/', include(router.urls)),
    path('unstable/', include(unstable_router.urls)),
    # convenience pattern: catch all citations, redirect in CaseDocumentViewSet's retrieve
    re_path(r'^v1/cases/(?P<id>[0-9A-Za-z\s\.]+)/$', api_views.CaseDocumentViewSet.as_view({'get': 'retrieve'}), name='case-get-cite'),

-    ### Swagger/OpenAPI/ReDoc ###
-    re_path(r'^swagger(?P<format>\.json|\.yaml)$', schema_view.without_ui(cache_timeout=None), name='schema-json'),
-    re_path(r'^swagger/$', schema_view.with_ui('swagger', cache_timeout=None), name='schema-swagger-ui'),
-    re_path(r'^redoc/$', schema_view.with_ui('redoc', cache_timeout=None), name='schema-redoc'),
-
    path('robots.txt', TemplateView.as_view(template_name='robots_api.txt', content_type='text/plain'), name='robots_api'),
    path('', RedirectView.as_view(url='/v1/', permanent=False), name='api-root')
]
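The drf-yasg schema view and its swagger/redoc routes are removed outright rather than ported, presumably for Python 3.11 compatibility. If schema endpoints are wanted again, a maintained generator such as drf-spectacular could stand in. A minimal sketch, assuming drf-spectacular were added to requirements (not part of this PR; names are illustrative):

    from django.urls import path
    from drf_spectacular.views import SpectacularAPIView, SpectacularSwaggerView

    urlpatterns += [
        path('schema/', SpectacularAPIView.as_view(urlconf='capapi.api_urls'), name='schema'),
        path('swagger/', SpectacularSwaggerView.as_view(url_name='schema'), name='schema-swagger-ui'),
    ]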
30 changes: 1 addition & 29 deletions capstone/capapi/tests/test_api.py
@@ -1,7 +1,6 @@
from csv import DictReader
from io import StringIO

-from flaky import flaky
from rest_framework.request import Request
from rest_framework.test import APIRequestFactory

@@ -583,10 +582,8 @@ def test_filter_reporter(client, reporter):

# NGRAMS

-@flaky(max_runs=10)  # ngrammed_cases call to ngram_jurisdictions doesn't reliably work because it uses multiprocessing within pytest environment
@pytest.mark.django_db(databases=['capdb'])
-def test_ngrams_api(client, request):
-    ngrammed_cases = request.getfixturevalue('ngrammed_cases')  # load fixture inside test so flaky() can catch errors
+def test_ngrams_api(client, ngrammed_cases):

# check result counts when not filtering by jurisdiction
json = client.get(api_reverse('ngrams-list'), {'q': 'one two'}).json()
@@ -600,31 +597,6 @@ def test_ngrams_api(client, request):
'one two': {
'jur1': [{'year': '2000', 'count': [1, 6], 'doc_count': [1, 2]}]}}

-    # check wildcard match
-    json = client.get(api_reverse('ngrams-list'), {'q': 'three *'}).json()
-    assert json['results'] == {
-        'three four': {
-            'total': [{'year': '2000', 'count': [1, 9], 'doc_count': [1, 3]}]},
-        "three don't": {
-            'total': [{'year': '2000', 'count': [2, 9], 'doc_count': [2, 3]}]}}
-
-
-# API SPECIFICATION ENDPOINTS
-@pytest.mark.django_db(databases=['capdb'])
-@pytest.mark.parametrize("url, content_type", [
-    (api_reverse("schema-swagger-ui"), 'text/html'),
-    (api_reverse("schema-json", args=['.json']), 'application/json'),
-    (api_reverse("schema-json", args=['.yaml']), 'application/yaml'),
-])
-def test_swagger(client, url, content_type):
-    response = client.get(url)
-    check_response(response, content_type=content_type)
-
-
-def test_redoc(client):
-    response = client.get(api_reverse("schema-redoc"))
-    check_response(response, content_type="text/html")


# PAGINATION
@pytest.mark.django_db(databases=['capdb'])
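The ngrams test change deserves a gloss: the @flaky retry and the request.getfixturevalue() call existed only so fixture-setup failures could be retried; with the retry logic dropped, the fixture can be a plain parameter. Side by side, distilled from the hunks above:

    # before: fixture loaded inside the test body so @flaky could catch setup errors
    @flaky(max_runs=10)
    def test_ngrams_api(client, request):
        ngrammed_cases = request.getfixturevalue('ngrammed_cases')
        ...

    # after: pytest injects the fixture directly and fails fast if setup breaks
    def test_ngrams_api(client, ngrammed_cases):
        ...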
184 changes: 23 additions & 161 deletions capstone/capapi/views/api_views.py
@@ -1,9 +1,6 @@
-import bisect
import urllib
import re
from datetime import datetime
-from collections import defaultdict
-from pathlib import Path

from django.utils.functional import partition
from django_filters.utils import translate_validation
@@ -25,7 +22,6 @@
from capapi.resources import api_request
from capdb import models
from capdb.models import CaseMetadata
-from capdb.storages import ngram_kv_store_ro
from capweb.helpers import cache_func
from scripts.helpers import alphanum_lower
from user_data.models import UserHistory
@@ -232,10 +228,11 @@ class CaseDocumentViewSet(BaseDocumentViewSet):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.valid_query_fields = [
-            *[field.name for backend in self.query_filter_backends
-              for field in backend().get_schema_fields(self)],
-            *[backend.search_param for backend in self.query_filter_backends
-              if hasattr(backend, 'search_param')]
+            # get_schema_fields() doesn't work in python 3.11, so this would need another approach if we want it:
+            # *[field.name for backend in self.query_filter_backends
+            #   for field in backend().get_schema_fields(self)],
+            # *[backend.search_param for backend in self.query_filter_backends
+            #   if hasattr(backend, 'search_param')]
        ]

def is_full_case_request(self):
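The commented-out lookup leaves valid_query_fields empty, which presumably disables whatever validation consumed it. A minimal sketch of a coreapi-free alternative, assuming the stock DRF backend attributes (hypothetical; this project's elasticsearch filter backends may expose their params differently):

    def collect_valid_query_fields(view):
        # read the query-param names each backend declares as plain attributes,
        # instead of calling the coreapi-based get_schema_fields()
        names = []
        for backend in view.query_filter_backends:
            for attr in ('search_param', 'ordering_param'):
                if hasattr(backend, attr):
                    names.append(getattr(backend, attr))
        return names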
@@ -414,27 +411,6 @@ def __init__(self, *args, **kwargs):
self.jurisdiction_id_to_slug = dict(models.Jurisdiction.objects.values_list('pk', 'slug'))
self.jurisdiction_id_to_slug[None] = 'total'
self.jurisdiction_slug_to_id = {v:k for k,v in self.jurisdiction_id_to_slug.items()}
-        self.totals_by_jurisdiction_year_length = self.load_totals()
-
-    @staticmethod
-    def load_totals():
-        # populate self.totals_by_jurisdiction_year_length, a mapping of jurisdiction-year-length to counts, like:
-        # {
-        #     (<jur_id>, <year>, <length>): (<word count>, <document count>),
-        # }
-        if not Path(ngram_kv_store_ro.db_path()).exists():
-            return {}
-        totals_by_jurisdiction_year_length = defaultdict(lambda: [0,0])
-        for k, v in ngram_kv_store_ro.get_prefix(b'totals', packed=True):
-            jur, year, n = ngram_kv_store_ro.unpack(k[len(b'totals'):])
-            totals_by_jurisdiction_year_length[(jur, year, n)] = v
-            for total in (
-                totals_by_jurisdiction_year_length[(None, year, n)],
-                totals_by_jurisdiction_year_length[(None, None, n)]
-            ):
-                total[0] += v[0]
-                total[1] += v[1]
-        return totals_by_jurisdiction_year_length

@staticmethod
def query_params_are_filters(query_body):
@@ -455,27 +431,6 @@ def query_params_are_filters(query_body):

return True


-    def get_query_data_from_api_query(self, q):
-        # given an `api(...)` query, return a structured list of filters and aggregations
-        # validate whether a case ID exists in the corpus
-        # check if the supplied item is a valid case id
-        if not q or not (q.startswith('api(') and q.endswith(')')):
-            return False
-
-        query_body = None
-        try:
-            query_body = QueryDict(q[4:-1], mutable=True)
-        except Exception:
-            raise ValidationError({'error': 'Query is not in a URL parameter format.'})
-
-        self.query_params_are_filters(query_body)
-
-        query_body['page_size'] = 1
-        query_body['facet'] = 'decision_date'
-
-        return query_body

@staticmethod
def create_timeline_entries(bucket_entries, total_dict, jurisdiction):
# generate timeline datapoint given an elasticsearch query result
@@ -561,117 +516,24 @@ def list(self, request, *args, **kwargs):
if not q:
return Response({})

-        # check if we're querying for a case as opposed to a word
-        # default to keyword search if value is empty
-        api_query_body = self.get_query_data_from_api_query(q)
-
-        # prepend word count as first byte. only applicable for n-grams
-        words = q.lower().split(' ')[:3]  # use first 3 words
-        q_len = len(words)
-        q_sig = bytes([q_len]) + ' '.join(words).encode('utf8')
-
-        if api_query_body:
-            try:
-                results = self.get_citation_data(request, api_query_body, q)
-            except filters.TooManyJoinedResultsException:
-                raise ValidationError({'error': 'The set of cases to cite to is too large. Consider \
-                    narrowing this group to contain less than 20000 cases.'})
-            pairs = []
-        elif q_sig.endswith(b' *'):
-            results = {}
-            # wildcard search
-            pairs = ngram_kv_store_ro.get_prefix(q_sig[:-1], packed=True)
-        else:
-            results = {}
-            # non-wildcard search
-            value = ngram_kv_store_ro.get(q_sig, packed=True)
-            if value:
-                pairs = [(q_sig, value)]
-            else:
-                pairs = []

-        ## format results
-        if pairs:
-            # prepare jurisdiction_filter from jurisdiction= query param
-            jurisdictions = request.GET.getlist('jurisdiction')
-            if '*' in jurisdictions:
-                jurisdiction_filter = None
-            else:
-                jurisdiction_filter = set(self.jurisdiction_slug_to_id[j] for j in jurisdictions if j in self.jurisdiction_slug_to_id)
-                if not jurisdiction_filter:
-                    jurisdiction_filter.add(None)
-
-            # prepare year_filter from year= query param
-            year_filter = set()
-            for year in request.GET.getlist('year'):
-                if year.isdigit():
-                    year_filter.add(int(year))
-
-            # get top 10 pairs
-            top_pairs = []
-            for gram, data in pairs:
-                total_jur = data[None]
-                sort_count = total_jur[None][0]
-                bisect.insort_right(top_pairs, (sort_count, gram, data))
-                top_pairs = top_pairs[-10:]
-
-            # Reformat stored gram data for delivery.
-            # top_pairs will look like:
-            #   [
-            #     (<sort_count>, b'<wordcount><gram>', {
-            #       <jur_id>: [
-            #         <year - 1900>, <instance_count>, <document_count>,
-            #         <year - 1900>, <instance_count>, <document_count>, ...
-            #       ]),
-            #   ]
-            # this reformats to:
-            #   {
-            #     <jurisdiction slug>: [
-            #       {
-            #         'year': <year>,
-            #         'count': [<instance_count>, <total instances>],
-            #         'doc_count': [<instance_count>, <total instances>],
-            #       }
-            #     ]
-            #   }
-            for _, gram, data in reversed(top_pairs):
-                out = {}
-                for jur_id, years in data.items():
-
-                    # apply jurisdiction_filter
-                    if jurisdiction_filter and jur_id not in jurisdiction_filter:
-                        continue
-
-                    years_out = []
-                    jur_slug = self.jurisdiction_id_to_slug[jur_id]
-                    if jur_id is None:
-                        years = [i for k, v in years.items() for i in [k]+v]
-                    for i in range(0, len(years), 3):
-                        year, count, doc_count = years[i:i+3]
-
-                        # filter out total
-                        if year is None:
-                            continue
-
-                        # years will be -1900 for msgpack compression -- add 1900 back in
-                        year += 1900
-
-                        # apply year filter
-                        if year_filter and year not in year_filter:
-                            continue
-
-                        totals = self.totals_by_jurisdiction_year_length[(jur_id, year, q_len)]
-                        years_out.append({
-                            "year": str(year) if year else "total",
-                            "count": [count, totals[0]],
-                            "doc_count": [doc_count, totals[1]]
-                        })
-
-                    years_out.sort(key=lambda y: y["year"])
-                    out[jur_slug] = years_out
-
-                if out:
-                    results[gram[1:].decode('utf8')] = out
+        if q.startswith('api(') and q.endswith(')'):
+            q = q[4:-1]
+
+        try:
+            api_query_body = QueryDict(q, mutable=True)
+        except Exception:
+            raise ValidationError({'error': 'Query is not in a URL parameter format.'})
+
+        self.query_params_are_filters(api_query_body)
+
+        api_query_body['page_size'] = 1
+        api_query_body['facet'] = 'decision_date'
+
+        try:
+            results = self.get_citation_data(request, api_query_body, q)
+        except filters.TooManyJoinedResultsException:
+            raise ValidationError({'error': 'The set of cases to cite to is too large. Consider \
+                narrowing this group to contain less than 20000 cases.'})

paginated = {
"count": len(results),
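The api(...) handling is now inlined in list() and is simple enough to demonstrate standalone. A minimal sketch of the same parsing steps (the query value is hypothetical; encoding is passed explicitly so QueryDict does not need configured Django settings):

    from django.http import QueryDict

    q = "api(search=insurance&jurisdiction=ill)"  # hypothetical query value
    if q.startswith('api(') and q.endswith(')'):
        q = q[4:-1]  # strip the api(...) wrapper

    body = QueryDict(q, mutable=True, encoding='utf-8')
    body['page_size'] = 1            # one case per timeline bucket
    body['facet'] = 'decision_date'  # aggregate counts by decision date

    assert body['search'] == 'insurance'
    assert body['facet'] == 'decision_date'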
2 changes: 1 addition & 1 deletion capstone/capdb/management/commands/run_celery_worker.py
@@ -6,7 +6,7 @@

# Create a command to autoreload celery beat for elasticsearch indexing
def autoreload_celery(*args, **kwargs):
-    celery_worker_cmd = "celery worker -A config.celery.app -c 1 -B --uid=nobody --gid=nogroup"
+    celery_worker_cmd = "celery -A config.celery.app -c 1 -B --uid=nobody --gid=nogroup worker"
    print("Kill lingering celery worker...")
    subprocess.run(shlex.split(f'pkill -f "{celery_worker_cmd}"'))
    print("Start celery worker...")
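One thing to double-check on this reordering: Celery 5 moved worker flags behind the subcommand, so the usual 5.x form is celery -A config.celery.app worker -c 1 -B --uid=nobody --gid=nogroup, with the options after worker rather than before it. Whether the string above parses depends on the Celery version pinned in requirements; the same string is also what the pkill -f call matches, so the command and the kill pattern must stay in sync either way.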