Skip to content

Commit

Permalink
fix bug where certain papers from LingBuzz don't behave well with parsing (skip them)
Browse files Browse the repository at this point in the history
  • Loading branch information
Dechrissen committed Dec 22, 2020
1 parent 8a6043d commit 593e3eb
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 10 deletions.
2 changes: 1 addition & 1 deletion config.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"user_email" : "[email protected]",
"keywords" : ["sound"]
"keywords" : ["sound", "anaphora"]
}
6 changes: 4 additions & 2 deletions create_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,14 @@ def create_csv_copy(filename, check):
with open(new_filename, 'a') as csvfile:
filewriter = csv.writer(csvfile, delimiter=',')
paper = scrapeLingBuzzHomePage(check)
if paper.title == 'dummy':
return False
try:
filewriter.writerow([paper.title, paper.link, paper.authors, paper.abstract, paper.keywords])
test_title = paper.title
except UnicodeEncodeError:
print('UnicodeEncodeError: \'' + paper.title[:31]+'...\'', 'skipped')
quit()
print(paper.title[:31]+'...\'', 'skipped')
return False
return test_title

def clean_authors(author_list):
Expand Down
23 changes: 18 additions & 5 deletions functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,22 @@ def scrapeLingBuzzHomePage(number_of_paper):
abstract = str(list(body.children)[5])

# Keywords
keywords_tr = list(list(body.children)[6].children)[3]
keywords_list_td = list(keywords_tr.children)[1]
keywords = keywords_list_td.get_text()
keywords = re.split(r'[,|;]', keywords)
keywords = [k.strip() for k in keywords]
try:
keywords_tr = list(list(body.children)[6].children)[3]
keywords_list_td = list(keywords_tr.children)[1]
keywords = keywords_list_td.get_text()
keywords = re.split(r'[,|;]', keywords)
keywords = [k.strip() for k in keywords]
except:
# return dummy paper when keyword list parsing doesn't behave
title='dummy'
pdf_link='dummy'
authors=['dummy']
abstract='dummy'
keywords = ['dummy']
date='dummy'
current_paper = Paper(title, pdf_link, authors, abstract, keywords, date)
return current_paper

# Construct Paper object
current_paper = Paper(title, pdf_link, authors, abstract, keywords, date)
Expand Down Expand Up @@ -204,3 +215,5 @@ def classifier(text):
#collected_papers = queryLingBuzz('pokemon')
#for x in collected_papers:
#print(x.date)
#y = queryLingBuzz('That’s a Curious Copular Construction You Have There!')
#print(y[0].title, y[0].keywords, y[0].abstract, y[0].link, y[0].authors, y[0].date)
17 changes: 15 additions & 2 deletions recommender.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,11 @@ def recommend(title, cosine_sim, indices, df):
recommended_papers = {}

# getting the index of the movie that matches the title
idx = indices[indices == title].index[0]
try:
idx = indices[indices == title].index[0]
except:
# return dummy values if indices doesn't behave (?)
return {}, 0.0

# creating a Series with the similarity scores in descending order
score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
Expand All @@ -40,6 +44,8 @@ def recommend(title, cosine_sim, indices, df):
def check_new(check):
# create a copy of `user.csv` with 5 extra papers appended
test_title = create_csv_copy('user.csv', check)
if not test_title:
return False


# create the dataframe according to `user.csv`
Expand Down Expand Up @@ -78,6 +84,9 @@ def check_new(check):


recs, score = recommend(test_title, cosine_sim, indices, df)
if score == 0.0:
# return False if recommend returns 0.0 for score
return False
link = df['Link'][test_title]
# This is to handle when more than one of the same paper is in the dataframe. str means 1, pandas Series means more than 1
if type(link) is not str:
Expand All @@ -103,7 +112,11 @@ def check_new(check):
recommendation = None
# check the 10 newest papers
for c in range(10):
possible_rec, score = check_new(c)
try:
possible_rec, score = check_new(c)
except TypeError:
# if check_new returns False, skip current iteration. This is to handle papers which don't behave
continue
if score > highest:
highest = score
recommendation = possible_rec
Expand Down

0 comments on commit 593e3eb

Please sign in to comment.