diff --git a/ltr/train/expansions.py b/ltr/train/expansions.py deleted file mode 100644 index bedd7c0..0000000 --- a/ltr/train/expansions.py +++ /dev/null @@ -1,64 +0,0 @@ -import json -from elasticsearch import Elasticsearch -from elasticsearch import TransportError - -def formatExpansion(keywords, minDocCount=3, searchField='text_all', expandField='text_all', shardSize=100): - from jinja2 import Template - template = Template(open("gatherExpansion.json.jinja").read()) - jsonStr = template.render(keywords=keywords, - minDocCount=minDocCount, - searchField=searchField, - expandField=expandField, - shardSize=shardSize) - return json.loads(jsonStr) - - -def getExpansions(es, keywords, minDocCount=3, searchField='text_all', expandField='text_all', - shardSize=100, index='tmdb'): - return "" - try: - query = formatExpansion(keywords, minDocCount=minDocCount, searchField=searchField, expandField=expandField, - shardSize=shardSize) - print("Query %s" % json.dumps(query)) - results = es.search(index=index, body=query) - rVal = "" - for sigTerm in results['aggregations']['over_top_n']['expansions']['buckets']: - term = sigTerm['key'] - multiTerm = term.split() - if len(multiTerm) > 1: - term = '"' + " ".join(multiTerm) + '"' - rVal += " %s^%s" % (term, sigTerm['score']) - return rVal - except json.JSONDecodeError as e: - return "" - - except TransportError as e: - print("Query %s" % json.dumps(query)) - print("Query Error: %s " % e.error) - print("More Info : %s " % e.info) - raise e - -def expansionTextAllBigrams(es, keywords): - return getExpansions(es, keywords, expandField='text_all.bigramed', minDocCount=1) - -def expansionTextAllTrigrams(es, keywords): - return getExpansions(es, keywords, expandField='text_all.trigramed', minDocCount=1) - -def expansionTextAll(es, keywords): - return getExpansions(es, keywords, expandField='text_all', minDocCount=1) - -def expansionTitle(es, keywords): - return getExpansions(es, keywords, expandField='title', minDocCount=1) - -def expansionGenre(es, keywords): - return getExpansions(es, keywords, expandField='genres.name', minDocCount=1) - -if __name__ == "__main__": - from sys import argv - import configparser - config = configparser.ConfigParser() - config.read('settings.cfg') - esUrl = config['DEFAULT']['ESHost'] - - es = Elasticsearch(esUrl, timeout=1000) - print(getExpansions(es, argv[1], expandField="text_all.bigramed")) diff --git a/ltr/train/indexMlTmdb.py b/ltr/train/indexMlTmdb.py deleted file mode 100644 index 30ebd97..0000000 --- a/ltr/train/indexMlTmdb.py +++ /dev/null @@ -1,49 +0,0 @@ -import json - -def enrich(movie): - """ Enrich for search purposes """ - if 'title' in movie: - movie['title_sent'] = 'SENTINEL_BEGIN ' + movie['title'] - if 'overview' in movie and movie['overview'] is not None: - movie['overview_sent'] = 'SENTINEL_BEGIN ' + movie['overview'] - -def reindex(es, movieDict={}, index='tmdb', esUrl='http://localhost:9200'): - import elasticsearch.helpers - settings = json.load(open('schema.json')) - - es.indices.delete(index, ignore=[400, 404]) - es.indices.create(index, body=settings) - - def bulkDocs(movieDict): - for id, movie in movieDict.items(): - if 'release_date' in movie and movie['release_date'] == "": - del movie['release_date'] - - movie['title_len'] = 0 - if 'title' in movie: - movie['title_len'] = len(movie['title']) - - enrich(movie) - addCmd = {"_index": index, #E - "_type": "movie", - "_id": id, - "_source": movie} - yield addCmd - if 'title' in movie: - print("%s added to %s" % (movie['title'], index)) - - elasticsearch.helpers.bulk(es, bulkDocs(movieDict)) - -if __name__ == "__main__": - import configparser - from elasticsearch import Elasticsearch - from sys import argv - - config = configparser.ConfigParser() - config.read('settings.cfg') - esUrl=config['DEFAULT']['ESHost'] - if len(argv) > 1: - esUrl = argv[1] - es = Elasticsearch(esUrl, timeout=30) - movieDict = json.loads(open('tmdb.json').read()) - reindex(es, movieDict=movieDict, esUrl=esUrl) diff --git a/ltr/train/movielens.py b/ltr/train/movielens.py deleted file mode 100644 index 226e75c..0000000 --- a/ltr/train/movielens.py +++ /dev/null @@ -1,68 +0,0 @@ - -import json -from elasticsearch import Elasticsearch - -def formatTopMlens(keywords, searchField='text_all', expandField='mlensId', shardSize=10): - from jinja2 import Template - template = Template(open("mlensIds.json.jinja").read()) - jsonStr = template.render(keywords=keywords, - searchField=searchField, - expandField=expandField, - shardSize=shardSize) - return json.loads(jsonStr) - - -def getTopMlensIds(es, keywords, searchField='text_all', index='tmdb'): - query = formatTopMlens(keywords, searchField=searchField) - print("Query %s" % json.dumps(query)) - results = es.search(index=index, body=query) - results = es.search(index=index, body=query) - rVal = [] - for sigTerm in results['aggregations']['over_top_n']['mlens']['buckets']: - mlensId = sigTerm['key'] - rVal.append(mlensId) - return rVal - - -def formatMlensExpansion(mlensIds, minDocCount=1, expandField='liked_movies', shardSize=10): - from jinja2 import Template - template = Template(open("mlensExpansion.json.jinja").read()) - jsonStr = template.render(mlensIds=json.dumps(mlensIds), - expandField=expandField, - minDocCount=minDocCount, - shardSize=shardSize) - return json.loads(jsonStr) - - -def getExpansions(es, mlensIds, minDocCount=1, expandField='liked_movies', - shardSize=10, index='movielens'): - return "" - try: - query = formatMlensExpansion(mlensIds=mlensIds, minDocCount=minDocCount, expandField=expandField, - shardSize=shardSize) - print("Query %s" % json.dumps(query)) - results = es.search(index=index, body=query) - rVal = "" - for sigTerm in results['aggregations']['over_top_n']['expansions']['buckets']: - term = sigTerm['key'] - multiTerm = term.split() - if len(multiTerm) > 1: - term = '"' + " ".join(multiTerm) + '"' - rVal += " %s^%s" % (term, sigTerm['score']) - return rVal - except json.decoder.JSONDecodeError as e: - print("DID NOT DECODE %s" % query) - return "" - - -def expansionMlens(es, keywords): - return "" - esMlens = Elasticsearch('http://ec2-54-234-184-186.compute-1.amazonaws.com:9616', timeout=1000) - topMlens = getTopMlensIds(es, keywords=keywords, searchField="title", index="tmdb") - return getExpansions(es=esMlens, mlensIds=topMlens, expandField="liked_movies", shardSize=10) - - -if __name__ == "__main__": - es = Elasticsearch() - from sys import argv - print(expansionMlens(es, argv[1])) diff --git a/ltr/train/rate.py b/ltr/train/rate.py deleted file mode 100644 index 9b9f893..0000000 --- a/ltr/train/rate.py +++ /dev/null @@ -1,155 +0,0 @@ -from esUrlParse import parseUrl -from judgments import Judgment, judgmentsFromFile, judgmentsToFile, judgmentsByQid -from elasticsearch import Elasticsearch, TransportError -import json - -def formatSearch(keywords): - from jinja2 import Template - template = Template(open("rateSearch.json.jinja").read()) - jsonStr = template.render(keywords=keywords) - return json.loads(jsonStr) - -def getPotentialResults(esUrl, keywords): - (esUrl, index, searchType) = parseUrl(esUrl) - es = Elasticsearch(esUrl) - - query = formatSearch(keywords) - try: - print("Query %s" % json.dumps(query)) - results = es.search(index=index, body=query) - return results['hits']['hits'] - except TransportError as e: - print("Query %s" % json.dumps(query)) - print("Query Error: %s " % e.error) - print("More Info : %s " % e.info) - raise e - - - -def gradeResults(results, keywords, qid): - titleField = 'title' - overviewField = 'overview' - ratings = [] - print("Rating %s results" % len(results)) - for result in results: - grade = None - if 'fields' not in result: - if '_source' in result: - result['fields'] = result['_source'] - if 'fields' in result: - print("") - print("") - print("## %s %s " % (result['fields'][titleField], result['_id'])) - print("") - print(" %s " % result['fields'][overviewField]) - print(" %s " % (" ".join([cast['name'] for cast in result['fields']['cast']]))) - while grade not in ["0", "1", "2", "3", "4"]: - grade = input("Rate this shiznit (0-4) ") - judgment = Judgment(int(grade), qid=qid, keywords=keywords, docId=result['_id']) - ratings.append(judgment) - - return ratings - - -def loadJudgments(judgFile): - currJudgments = [] - existingKws = set() - lastQid = 0 - try: - currJudgments = [judg for judg in judgmentsFromFile(judgFile)] - existingKws = set([judg.keywords for judg in currJudgments]) - judgDict = judgmentsByQid(currJudgments) - judgProfile = [] - for qid, judglist in judgDict.items(): - judgProfile.append((judglist[0], len(judglist))) - judgProfile.sort(key=lambda j: j[1], reverse=True) - for prof in judgProfile: - print("%s has %s judgments" % (prof[0].keywords, prof[1])) - - lastQid = currJudgments[-1].qid - except FileNotFoundError: - pass - - return (currJudgments, existingKws, lastQid) - - -def handleKeywords(inputKws, currJudgments): - - keywordsWithExpansion = inputKws.split(';') - keywordsWithSearchInstead = inputKws.split(';;') - keywords = keywordsWithExpansion[0] - searchWith = keywords - if (len(keywordsWithExpansion) > 1): - searchWith += " %s" % keywordsWithExpansion[1] - if (len(keywordsWithSearchInstead) > 1): - searchWith = keywordsWithSearchInstead[1] - - existingQid = -1 - thisQueryJudgments = [] - if keywords in existingKws: - for judgment in currJudgments: - if judgment.keywords == keywords: - thisQueryJudgments.append(judgment) - existingQid = judgment.qid - - return keywords, searchWith, thisQueryJudgments, existingQid - - -def foldInNewRatings(fullJudgments, origJudgments, newJudgs): - for newJudg in newJudgs: - wasAnUpdate = False - for origJudg in origJudgments: - if (origJudg.sameQueryAndDoc(newJudg)): - origJudg.grade = newJudg.grade - wasAnUpdate = True - if not wasAnUpdate: - fullJudgments.append(newJudg) - - - -if __name__ == "__main__": - """ - Prompts console user for judgments - Usage python rate.py ratingsFileName - - Prompt guide - foo -- searches for "foo" using rateSearch.json.jinja, - foo; bar -- rate keyword "foo", but add "bar" to the query - foo;; bar -- rate keyword "foo", searching for "bar" instead - - """ - from sys import argv - import configparser - - config = configparser.ConfigParser() - config.read('settings.cfg') - esUrl = config['DEFAULT']['ESHost'] - - judgFile = argv[1] - fullJudgments, existingKws, lastQid = loadJudgments(judgFile) - - keywords = "-" - newQid = lastQid + 1 - while len(keywords) > 0: - inputKws = input("Enter the Keywords ('GTFO' to exit) ") - - if inputKws == "GTFO": - break - - keywords, searchWith, origQueryJudgments, existingQid = handleKeywords(inputKws, fullJudgments) - currQid = 0 - if existingQid > 0: - currQid = existingQid - print("Updating judgments for qid:%s" % currQid) - else: - existingKws.add(keywords) - currQid = newQid - print("New Keywords %s qid:%s" % (keywords, currQid)) - newQid += 1 - - results = getPotentialResults(esUrl, searchWith) - newQueryJudgments = gradeResults(results, keywords, currQid) - - foldInNewRatings(fullJudgments, origQueryJudgments, newQueryJudgments) - - judgmentsToFile(judgFile, fullJudgments) diff --git a/ltr/train/searchJustLtr.py b/ltr/train/searchJustLtr.py deleted file mode 100644 index 0c3afe7..0000000 --- a/ltr/train/searchJustLtr.py +++ /dev/null @@ -1,48 +0,0 @@ -baseQuery = { - "size": 5, - "query": { - "sltr": { - "params": { - "keywords": "", - "expansions": "" - }, - "model": "", - } - } -} - -def ltrQuery(keywords, modelName): - import json - from expansions import expansionTextAllBigrams, expansionTextAll, expansionGenre - #baseQuery['query']['sltr']['params']['expansions_text_all_bigrams'] = expansionTextAllBigrams(es, keywords) - #baseQuery['query']['sltr']['params']['expansions_text_all'] = expansionTextAll(es, keywords) - #baseQuery['query']['sltr']['params']['expansions_genre'] = expansionGenre(es, keywords) - baseQuery['query']['sltr']['params']['keywords'] = keywords - baseQuery['query']['sltr']['model'] = model - print("%s" % json.dumps(baseQuery)) - return baseQuery - - -if __name__ == "__main__": - import configparser - from sys import argv - from elasticsearch import Elasticsearch - - config = configparser.ConfigParser() - config.read('settings.cfg') - esUrl=config['DEFAULT']['ESHost'] - - es = Elasticsearch(esUrl, timeout=1000) - model = "test_6" - if len(argv) > 2: - model = argv[2] - keywords = argv[1] - results = es.search(index='tmdb', doc_type='movie', body=ltrQuery(keywords, model)) - for result in results['hits']['hits']: - print("%s " % (result['_source']['title'])) - print("%s " % (result['_score'])) - print("%s " % (result['_source']['vote_average'])) - print("%s " % (result['_source']['vote_count'])) - print("%s " % (result['_source']['overview'])) - print("---------------------------------------") - diff --git a/ltr/train/train.py b/ltr/train/train.py index b1e9170..fa9d1f1 100644 --- a/ltr/train/train.py +++ b/ltr/train/train.py @@ -32,44 +32,6 @@ def partitionJudgments(judgments, testProportion=0.1): return (trainJudgments, testJudgments) - - -def saveModel(esHost, scriptName, featureSet, modelFname): - """ Save the ranklib model in Elasticsearch """ - import requests - import json - from urllib.parse import urljoin - modelPayload = { - "model": { - "name": scriptName, - "model": { - "type": "model/ranklib", - "definition": { - } - } - } - } - - # Force the model cache to rebuild - path = "_ltr/_clearcache" - fullPath = urljoin(esHost, path) - print("POST %s" % fullPath) - resp = requests.post(fullPath) - if (resp.status_code >= 300): - print(resp.text) - - with open(modelFname) as modelFile: - modelContent = modelFile.read() - path = "_ltr/_featureset/%s/_createmodel" % featureSet - fullPath = urljoin(esHost, path) - modelPayload['model']['model']['definition'] = modelContent - print("POST %s" % fullPath) - resp = requests.post(fullPath, json.dumps(modelPayload)) - print(resp.status_code) - if (resp.status_code >= 300): - print(resp.text) - - if __name__ == "__main__": HUMAN_JUDGMENTS = 'movie_judgments.txt' @@ -90,8 +52,7 @@ def saveModel(esHost, scriptName, featureSet, modelFname): solrColl = SolrColl(solrUrl) - - # Load features into Elasticsearch + # Load features into Solr solrColl.reloadFeatures(features=eachFeature()) # Parse a judgments print("-Parse judgments...") @@ -101,7 +62,7 @@ def saveModel(esHost, scriptName, featureSet, modelFname): print("-Train test split") trainJudgments, testJudgments = partitionJudgments(movieJudgments, testProportion=0.1) - # Use proposed Elasticsearch queries (1.json.jinja ... N.json.jinja) to generate a training set + # Use proposed Solr queries (1.json... N.json) to generate a training set # output as "sample_judgments_wfeatures.txt" print("-Log Features") logFeatures(solrColl, judgmentsByQid=movieJudgments)