Skip to content

Commit

Permalink
fix bug where certain papers from LingBuzz don't behave well with parsing (skip them)
Browse files Browse the repository at this point in the history
  • Loading branch information
Dechrissen committed Dec 22, 2020
1 parent 8a6043d commit 593e3eb
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 10 deletions.
2 changes: 1 addition & 1 deletion config.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"user_email" : "[email protected]",
"keywords" : ["sound"]
"keywords" : ["sound", "anaphora"]
}
6 changes: 4 additions & 2 deletions create_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,14 @@ def create_csv_copy(filename, check):
with open(new_filename, 'a') as csvfile:
filewriter = csv.writer(csvfile, delimiter=',')
paper = scrapeLingBuzzHomePage(check)
if paper.title == 'dummy':
return False
try:
filewriter.writerow([paper.title, paper.link, paper.authors, paper.abstract, paper.keywords])
test_title = paper.title
except UnicodeEncodeError:
print('UnicodeEncodeError: \'' + paper.title[:31]+'...\'', 'skipped')
quit()
print(paper.title[:31]+'...\'', 'skipped')
return False
return test_title

def clean_authors(author_list):
Expand Down
23 changes: 18 additions & 5 deletions functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,22 @@ def scrapeLingBuzzHomePage(number_of_paper):
abstract = str(list(body.children)[5])

# Keywords
keywords_tr = list(list(body.children)[6].children)[3]
keywords_list_td = list(keywords_tr.children)[1]
keywords = keywords_list_td.get_text()
keywords = re.split(r'[,|;]', keywords)
keywords = [k.strip() for k in keywords]
try:
keywords_tr = list(list(body.children)[6].children)[3]
keywords_list_td = list(keywords_tr.children)[1]
keywords = keywords_list_td.get_text()
keywords = re.split(r'[,|;]', keywords)
keywords = [k.strip() for k in keywords]
except:
# return dummy paper when keyword list parsing doesn't behave
title='dummy'
pdf_link='dummy'
authors=['dummy']
abstract='dummy'
keywords = ['dummy']
date='dummy'
current_paper = Paper(title, pdf_link, authors, abstract, keywords, date)
return current_paper

# Construct Paper object
current_paper = Paper(title, pdf_link, authors, abstract, keywords, date)
Expand Down Expand Up @@ -204,3 +215,5 @@ def classifier(text):
#collected_papers = queryLingBuzz('pokemon')
#for x in collected_papers:
#print(x.date)
#y = queryLingBuzz('That’s a Curious Copular Construction You Have There!')
#print(y[0].title, y[0].keywords, y[0].abstract, y[0].link, y[0].authors, y[0].date)
17 changes: 15 additions & 2 deletions recommender.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,11 @@ def recommend(title, cosine_sim, indices, df):
recommended_papers = {}

# getting the index of the movie that matches the title
idx = indices[indices == title].index[0]
try:
idx = indices[indices == title].index[0]
except:
# return dummy values if indices doesn't behave (?)
return {}, 0.0

# creating a Series with the similarity scores in descending order
score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
Expand All @@ -40,6 +44,8 @@ def recommend(title, cosine_sim, indices, df):
def check_new(check):
# create a copy of `user.csv` with 5 extra papers appended
test_title = create_csv_copy('user.csv', check)
if not test_title:
return False


# create the dataframe according to `user.csv`
Expand Down Expand Up @@ -78,6 +84,9 @@ def check_new(check):


recs, score = recommend(test_title, cosine_sim, indices, df)
if score == 0.0:
# return False if recommend returns 0.0 for score
return False
link = df['Link'][test_title]
# This is to handle when more than one of the same paper is in the dataframe. str means 1, pandas Series means more than 1
if type(link) is not str:
Expand All @@ -103,7 +112,11 @@ def check_new(check):
recommendation = None
# check the 10 newest papers
for c in range(10):
possible_rec, score = check_new(c)
try:
possible_rec, score = check_new(c)
except TypeError:
# if check_new returns False, skip current iteration. This is to handle papers which don't behave
continue
if score > highest:
highest = score
recommendation = possible_rec
Expand Down

0 comments on commit 593e3eb

Please sign in to comment.