Skip to content

Commit

Permalink
Merge pull request #294 from GSA/feature/99_remove_duplicates_opp
Browse files Browse the repository at this point in the history
Remove duplicate solicitation numbers in Opportunity Gathering
  • Loading branch information
BuckinghamAJ authored Jan 3, 2024
2 parents 45a6810 + 3ec3f64 commit 6823e00
Show file tree
Hide file tree
Showing 10 changed files with 105 additions and 59 deletions.
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ install_requires =
imbalanced-learn
nltk
numpy
pandas
psycopg2-binary
python-dateutil
python-json-logger
Expand Down
22 changes: 13 additions & 9 deletions src/fbo_scraper/db/db_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,12 +462,8 @@ def insert_data_into_solicitations_table(session, data):
)
).lower()

if not sol_existed_in_db:
logger.info("Inserting {}".format(sol.solNum))
session.add(sol)
else:
#print("Updating {}".format(sol.solNum))
logger.info("Updating {}".format(sol.solNum))

insert_data_into(session, sol, sol_existed_in_db)

opp_count += 1

Expand All @@ -486,6 +482,13 @@ def insert_data_into_solicitations_table(session, data):
)
)

def insert_data_into(db_session, from_sol_model, existed_in_db):
    """Add a new solicitation to the session, or log that it is being updated.

    Args:
        db_session: Session-like object; only its ``add`` method is used here.
        from_sol_model: Solicitation model whose ``solNum`` attribute is logged.
        existed_in_db: Truthy when the solicitation was already in the database.

    Returns:
        None.
    """
    if existed_in_db:
        # Nothing is added on this path; the update is only logged.
        # NOTE(review): presumably the model is already tracked by the session
        # so its changes are persisted on commit -- confirm against the caller.
        logger.info("Updating %s", from_sol_model.solNum)
        return

    logger.info("Inserting %s", from_sol_model.solNum)
    db_session.add(from_sol_model)

def get_validation_count(session):
"""
Expand Down Expand Up @@ -577,7 +580,7 @@ def fetch_solicitations_by_solnbr(solnbr: str, session, as_dict: bool=True) -> U
solicitation = session.query(db.Solicitation).filter(db.Solicitation.solNum == solnbr).first()

if as_dict:
sol_dict = object_as_dict(solicitation)
sol_dict = object_as_dict(solicitation) if solicitation else None
else:
sol_dict = solicitation

Expand All @@ -598,7 +601,7 @@ def fetch_notice_by_id(notice_id, session):
notice = session.query(db.Notice).get(notice_id)
except AttributeError:
return
notice_dict = object_as_dict(notice)
notice_dict = object_as_dict(notice) if notice else None

return notice_dict

Expand Down Expand Up @@ -676,7 +679,8 @@ def fetch_notice_attachments(notice_id, session):
attachments = session.query(db.Attachment).filter(
db.Attachment.notice_id == notice_id
)
attachment_dicts = [object_as_dict(a) for a in attachments]

attachment_dicts = [object_as_dict(a) for a in attachments] if attachments else []

return attachment_dicts

Expand Down
7 changes: 7 additions & 0 deletions src/fbo_scraper/get_opps.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import urllib
import errno
from pathlib import Path
import pandas as pd

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from fbo_scraper.get_doc_text import get_doc_text
Expand Down Expand Up @@ -299,6 +300,12 @@ def transform_opps(opps, out_path, skip_attachments=False):
]
schematized_opp["attachments"].extend(attachment_data)
transformed_opps.append(schematized_opp)

# Remove duplicate solicitation numbers, which would otherwise violate the unique solNum constraint
df = pd.DataFrame(transformed_opps)
df.drop_duplicates(subset=['solnbr'], inplace=True)
transformed_opps = df.to_dict('records')

return transformed_opps


Expand Down
14 changes: 8 additions & 6 deletions src/fbo_scraper/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,34 +149,36 @@ def main(

logger.info("Smartie is fetching opportunties from SAM...")

data = get_opps.main(
opps_data = get_opps.main(
limit,
opportunity_filter_function=opportunity_filter_function,
target_sol_types=target_sol_types,
skip_attachments=skip_attachments,
from_date=from_date,
to_date=to_date,
)
if not data:
if not opps_data:
logger.info("Smartie didn't find any opportunities!")
else:
logger.info("Smartie is done fetching opportunties from SAM!")

logger.info("Smartie is making predictions for each notice attachment...")

data = predict.insert_predictions(data)
predict_data = predict.insert_predictions(opps_data)
logger.info(
"Smartie is done making predictions for each notice attachment!"
)

with dal.Session.begin() as session:
if data:
if predict_data:
# insert_data(session, data)
logger.info("Smartie is inserting data into the database...")
insert_data_into_solicitations_table(session, data)
insert_data_into_solicitations_table(session, predict_data)
logger.info("Smartie is done inserting data into database!")
else:
logger.error("No data to insert. Something went wrong.")
if opps_data and not predict_data:
# We received opps data but no predictions. This is a problem.
logger.error("No predicition data to insert. Something went wrong.")

if updateOld:
update_old_solicitations(session, max_tests=10)
Expand Down
31 changes: 31 additions & 0 deletions tests/mock_opps.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,37 @@
"emails": ["[email protected]"],
}

# Two identical schematized opportunity records, both with "solnbr" "ATC1234".
# NOTE(review): judging by the name, this fixture is meant to reproduce the
# unique solNum constraint error on insert -- confirm against the tests using it.
mock_schematized_solnum_constraint_error = [
{
"notice type": "Special Notice",
"solnbr": "ATC1234",
"agency": "DEPT OF DEFENSE",
"compliant": 0,
"office": "DEPT OF THE AIR FORCE",
"attachments": [],
"classcod": None,
"naics": "5",
"subject": "Gartner Licenses",
"url": "https://beta.sam.gov/opp/bdc8e589bfe24772a226b98b16239cb7/view",
"setaside": "",
"emails": ["[email protected]"],
},
{
"notice type": "Special Notice",
"solnbr": "ATC1234",
"agency": "DEPT OF DEFENSE",
"compliant": 0,
"office": "DEPT OF THE AIR FORCE",
"attachments": [],
"classcod": None,
"naics": "5",
"subject": "Gartner Licenses",
"url": "https://beta.sam.gov/opp/bdc8e589bfe24772a226b98b16239cb7/view",
"setaside": "",
"emails": ["[email protected]"],
}
]

mock_attachment_data = {
"text": "test",
"filename": "test.txt",
Expand Down
63 changes: 31 additions & 32 deletions tests/test_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
sys.path.append( os.path.dirname( os.path.dirname( os.path.abspath(__file__) ) ) )
from tests.mock_opps import mock_schematized_opp_one
from fbo_scraper.db.db import Notice, NoticeType, Attachment, Model, now_minus_two, Solicitation
from fbo_scraper.db.db_utils import (session_scope, insert_data_into_solicitations_table,
DataAccessLayer, clear_data, object_as_dict, fetch_notice_type_id,
from fbo_scraper.db.db_utils import (insert_data_into_solicitations_table,
clear_data, object_as_dict, fetch_notice_type_id,
insert_model, insert_notice_types, retrain_check,
get_validation_count, get_trained_count,
get_validated_untrained_count, fetch_validated_attachments,
Expand Down Expand Up @@ -67,13 +67,13 @@ def setUp(self):
self.dal.create_test_postgres_db()
self.dal.connect()

with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
insert_notice_types(session)

self.maxDiff = None

def tearDown(self):
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
clear_data(session)
close_all_sessions()
self.dal.drop_test_postgres_db()
Expand All @@ -82,7 +82,7 @@ def tearDown(self):

def test_insert_bad_notice(self):
call_count = 0
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
# intentionally bad notice type
data = mock_schematized_opp_one.copy()
data['notice type'] = "not to be found"
Expand All @@ -97,7 +97,7 @@ def test_insert_bad_notice(self):
assert call_count >= 1, "We should get one warning when adding a notice with a new notice type."

def test_insert_notice_types(self):
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
insert_notice_types(session)

types = [
Expand All @@ -108,7 +108,7 @@ def test_insert_notice_types(self):
]
notice_type_ids = []
for notice_type in types:
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
notice_type_id = (
session.query(NoticeType.id)
.filter(NoticeType.notice_type == notice_type)
Expand All @@ -122,10 +122,10 @@ def test_insert_notice_types(self):
self.assertEqual(result, expected)

def test_insert_data_into_solicitations_table(self):
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
insert_data_into_solicitations_table(session, self.data)
result = []
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
solicitations = session.query(Solicitation).filter(Solicitation.solNum == self.data[0]['solnbr'])
for s in solicitations:
notice = object_as_dict(s)
Expand Down Expand Up @@ -162,10 +162,10 @@ def test_insert_data_into_solicitations_table_with_new_notice_type(self):
opp = self.data[0].copy()
nnt = "new notice type"
opp["notice type"] = nnt
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
insert_data_into_solicitations_table(session, [opp])
result = []
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
notices = session.query(Notice).all()
for n in notices:
notice = object_as_dict(n)
Expand All @@ -177,10 +177,10 @@ def test_insert_model(self):
results = {"c": "d"}
params = {"a": "b"}
score = 0.99
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
insert_model(session, results=results, params=params, score=score)
result = []
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
models = session.query(Model).all()
for m in models:
model = object_as_dict(m)
Expand All @@ -197,53 +197,53 @@ def test_fetch_last_score(self):
results = {"c": "d"}
params = {"a": "b"}
score = 0.99
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
insert_model(session, results=results, params=params, score=score)
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
score = fetch_last_score(session)
result = score
expected = 0.99
self.assertEqual(result, expected)

def test_get_validation_count(self):
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
insert_data_into_solicitations_table(session, self.data)
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
result = get_validation_count(session)
expected = 0
self.assertEqual(result, expected)

def test_get_trained_count(self):
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
insert_data_into_solicitations_table(session, self.data)
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
result = get_trained_count(session)
expected = 0
self.assertEqual(result, expected)

def test_get_validated_untrained_count(self):
result = None
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
insert_data_into_solicitations_table(session, self.data)
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
result = get_validated_untrained_count(session)
expected = 0
self.assertEqual(result, expected)

def test_retrain_check(self):
result = None
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
insert_data_into_solicitations_table(session, self.data)
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
result = retrain_check(session)
expected = False
self.assertEqual(result, expected)

def test_fetch_validated_attachments(self):
attachments = None
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
insert_data_into_solicitations_table(session, self.data)
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
attachments = fetch_validated_attachments(session)
result = len(attachments)
# 993 since that's how many docs were initially labeled
Expand All @@ -252,23 +252,22 @@ def test_fetch_validated_attachments(self):

def test_fetch_solicitations_by_solnbr(self):
notices = None
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
insert_data_into_solicitations_table(session, self.data)
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
notices = fetch_solicitations_by_solnbr('test', session)
result = len(notices)
expected = 28 # Amount of keys in dict
self.assertEqual(result, expected)

def test_fetch_solicitations_by_solnbr_bogus_solnbr(self):
notices = []
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
insert_data_into_solicitations_table(session, self.data)
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
notices = fetch_solicitations_by_solnbr("notexist", session)
result = len(notices)
expected = 0
self.assertEqual(result, expected)

self.assertEqual(notices, None)


if __name__ == '__main__':
Expand Down
11 changes: 6 additions & 5 deletions tests/test_db_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from tests.mock_opps import mock_schematized_opp_two
from fbo_scraper.db.db import Notice, NoticeType, Solicitation, Attachment, Model, now_minus_two
from fbo_scraper.db.db_utils import session_scope, insert_data_into_solicitations_table, \
DataAccessLayer, insert_notice_types, update_solicitation_history, search_for_agency, handle_attachments, apply_predictions_to
from fbo_scraper.db.db_utils import insert_data_into_solicitations_table, \
DataAccessLayer, insert_notice_types, update_solicitation_history, search_for_agency, handle_attachments, apply_predictions_to,create_new_or_exisiting_sol, insert_data_into

from fbo_scraper.db.connection import get_db_url

Expand All @@ -28,24 +28,25 @@ def setUp(self):
self.dal.create_test_postgres_db()
self.dal.connect()

with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
insert_notice_types(session)


def tearDown(self):
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
clear_data(session)
close_all_sessions()
self.dal.drop_test_postgres_db()
self.dal = None
self.data = None

def test_insert_data_into_solicitations_table(self):
with session_scope(self.dal) as session:
with self.dal.Session.begin() as session:
try:
insert_data_into_solicitations_table(session, [mock_schematized_opp_two])
except Exception as e:
print (e)


def test_update_solicitation_history():
# Create a mock solicitation object
Expand Down
Loading

0 comments on commit 6823e00

Please sign in to comment.