from functions import *
from create_dataset import *

import numpy as np
import pandas as pd
import nltk
from statistics import mean
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def recommend(title, cosine_sim, indices, df):
    """Take a paper title and return the top 5 recommended papers,
    along with the average similarity score over the dataset."""
    recommended_papers = {}
    # get the index of the paper that matches the title
    try:
        idx = indices[indices == title].index[0]
    except IndexError:
        # title not found in the index: return empty results
        return {}, 0.0
    # create a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending=False)
    # get the indexes of the 5 most similar papers, skipping position 0,
    # which is the paper's similarity with itself (always 1.0)
    top_5_indexes = list(score_series.iloc[1:6].index)
    # average similarity to all other papers, again excluding the
    # self-match, which would inflate the score
    average_score = mean(score_series.iloc[1:])
    # populate the dict with the titles and PDF links of the best matches
    for i in top_5_indexes:
        recommended_papers[df.index[i]] = df['Link'].iloc[i]
    return recommended_papers, average_score
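
# A minimal sketch of how recommend() can be called on its own; the title
# below is a hypothetical placeholder, and cosine_sim/indices/df are assumed
# to have been built as in check_new() below:
#
#   recs, avg = recommend('Some paper title', cosine_sim, indices, df)
#   for paper, link in recs.items():
#       print(paper, '->', link)
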
def check_new(check):
    """Check paper number `check` among the newest uploads against the
    user model; return (recommendation string, score), or False on failure."""
    # create a copy of `user.csv` with 5 extra papers appended
    test_title = create_csv_copy('user.csv', check)
    if not test_title:
        return False
    # create the dataframe according to `user.csv`
    df = create_df()
    # merge the dataframe's columns (authors, abstract, keywords) into
    # one bag_of_words column
    df = merge_df(df)
    # set the dataframe's Title column as index
    df.set_index('Title', inplace=True)
    # instantiate the vectorizer and generate the TF-IDF matrix
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
    tfidf_matrix = tf.fit_transform(df['bag_of_words'])
    # create a Series of titles, used to look up a paper's row number
    indices = pd.Series(df.index)
    # generate the cosine similarity matrix from the TF-IDF matrix:
    # entry [i][j] is the similarity between papers i and j
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    recs, score = recommend(test_title, cosine_sim, indices, df)
    if score == 0.0:
        # recommend() could not find the title
        return False
    link = df['Link'][test_title]
    # handle duplicate entries for the same paper: a unique title yields a
    # str, a duplicated title yields a pandas Series of links
    if not isinstance(link, str):
        link = link.iloc[0]
    possible_rec = 'Recommendation: ' + test_title + '\nLink to PDF: ' + link
    return possible_rec, score
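
# For reference, the TF-IDF + cosine-similarity step above can be tried in
# isolation on a toy corpus; the strings below are made-up stand-ins for
# bag_of_words cells, not real LingBuzz data:
#
#   toy_docs = ['syntax islands movement', 'phonology tone sandhi', 'syntax wh islands']
#   toy_tfidf = TfidfVectorizer(stop_words='english').fit_transform(toy_docs)
#   print(cosine_similarity(toy_tfidf, toy_tfidf))
#
# The result is a symmetric 3x3 matrix with 1.0 on the diagonal (each paper
# compared with itself); recommend() sorts one row of such a matrix and
# keeps the highest off-diagonal entries.
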
#####################################
######## Program begins here ########
#####################################
print('Downloading nltk dependencies ...')
nltk.download('stopwords')
nltk.download('punkt')
print('Done')
# numpy format for printing floats
np.set_printoptions(formatter={'float': lambda x: "{0:0.8f}".format(x)})
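# e.g. print(np.array([0.5])) now prints [0.50000000] instead of [0.5]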
print('Building user model ...')
# construct a .csv file containing all papers found on LingBuzz from querying
# with user-entered keywords
create_csv()
print('Checking new uploads against user model ...')
highest = 0
recommendation = None
# check the 10 newest papers
for c in range(10):
    try:
        possible_rec, score = check_new(c)
    except TypeError:
        # check_new returned False, which fails to unpack into two values;
        # this happens for LingBuzz papers that don't parse cleanly, so skip
        continue
    if score > highest:
        highest = score
        recommendation = possible_rec
print(recommendation if recommendation else 'No recommendation found.')