-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsearchEngine.js
113 lines (101 loc) · 3.32 KB
/
searchEngine.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
/**
* @param {string} word
* @returns string
*/
const getNormalizedWord = (word) => word.match(/\w+/g)[0];
/**
* @param {number} matchedDocsCount
* @param {number} docsCount
*/
const getInverseDocumentFrequency = (matchedDocsCount, docsCount) => {
return Math.log10(docsCount / matchedDocsCount).toPrecision(2);
};
/**
* @param {string} word
* @param {string[]} phrases
* @returns number
*/
const getNumberOfMatches = (word, phrases) => {
let counter = 0;
phrases.forEach((phrase) => {
if (word === phrase) {
counter += 1;
}
});
return counter;
};
/**
* @param {{id: string, text: string}[]} docs
* @returns {{}}
*/
const getInvertedIndex = (docs) => docs.reduce((acc, { id, text }) => {
const pureWords = text.split(' ').map(getNormalizedWord);
const wordsToDocs = pureWords.reduce((coll, word) => {
const matchesCount = getNumberOfMatches(word, pureWords);
const wordsCount = pureWords.length;
const termFrequency = Number((matchesCount / wordsCount).toPrecision(2));
return { ...coll, [word]: { docs: [{ id, termFrequency }] } };
}, {});
let newAcc = { ...acc };
Object.keys(wordsToDocs).forEach((key) => {
const docsContainsWord = wordsToDocs[key];
let newDocs = [];
if (newAcc[key]) {
newDocs = [...newAcc[key].docs];
newAcc = { ...newAcc, [key]: { docs: [...docsContainsWord.docs, ...newDocs] } };
} else {
newAcc = { ...newAcc, [key]: { docs: [...docsContainsWord.docs, ...newDocs] } };
}
});
return newAcc;
}, {});
/**
* @param {{id: string, text: string}[]} documents
* @returns {{search: (arg0: string) => string[]}}
*/
const buildSearchEngine = (documents) => ({
/**
* @param {string} searchPhrase
* @returns {string[]}
*/
search: (searchPhrase) => {
if (documents.length === 0) {
return [];
}
const phrases = searchPhrase.split(' ');
const purePhrases = phrases.map(getNormalizedWord);
const indexedWords = getInvertedIndex(documents);
const wordsWithIdf = Object.keys(indexedWords).reduce((acc, word) => {
const matchedDocsCount = indexedWords[word].docs.length;
const docsCount = documents.length;
const idf = getInverseDocumentFrequency(matchedDocsCount, docsCount);
const { docs } = indexedWords[word];
return { ...acc, [word]: { docs, idf } };
}, {});
const matchedDocs = purePhrases.reduce((acc, phrase) => {
if (wordsWithIdf[phrase]) {
const { idf, docs } = wordsWithIdf[phrase];
const idfiedDocs = docs.map(({ id, termFrequency }) => {
const tfIdf = (termFrequency * idf).toPrecision(2);
return { id, tfIdf: Number(tfIdf) };
});
return [...acc, ...idfiedDocs];
}
return acc;
}, []);
const relevancedDocs = matchedDocs.reduce((acc, doc) => {
const { id, tfIdf } = doc;
if (acc[id]) {
const newTfIdf = (acc[id] + tfIdf).toPrecision(2);
return { ...acc, [id]: Number(newTfIdf) };
}
return { ...acc, [id]: tfIdf };
}, {});
const docsWithAggregatedIdf = Object.keys(relevancedDocs).reduce((acc, key) => {
return [...acc, { id: key, tfIdf: relevancedDocs[key] }];
}, []);
docsWithAggregatedIdf.sort((first, next) => next.tfIdf - first.tfIdf);
return docsWithAggregatedIdf.map(({ id }) => id);
},
});
module.exports = buildSearchEngine;