-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuildmetadatadb.sql
123 lines (81 loc) · 6.45 KB
/
buildmetadatadb.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
-- db optimization
PRAGMA temp_store_directory = '.';
pragma page_size=16384;
pragma default_cache_size=3000;
pragma journal_size_limit = 10000000;
pragma legacy_file_format = false;
pragma synchronous = 1 ;
pragma auto_vacuum=2;
-- import and preparation
create temp table tempinput
as select id, j2s(authorsurnames) as authorsurnames,j2s(authorfirstnames) as authorfirstnames, title, publisher, publicationYear as year from (setschema 'id, authorsurnames,authorfirstnames, title, publisher, publicationYear' select jdictsplit(c1, 'id', 'authorsurnames','authorfirstnames' , 'title', 'publisher', 'publicationYear') from stdinput());
create temp table metadata
as select id, authorsurnames, authorfirstnames, stripchars(normalizetext(lower(title)),"_") as title, publisher, year from tempinput where title like "% % %";
-- triple creation
create temp table triples
as select id,middle as words, comprspaces(j2s(prev,middle,next)) as title from (setschema 'id, prev, middle, next' select id,textwindow2s(title,15,3,15) from metadata) group by id, words, title;
-- singularly distinguished titles
create temp table uniquesplittitles
as select words,titles,ids from (select words,jgroup(title) as titles, jgroup(id) as ids ,count(distinct(id)) as num from triples group by words) where num = 1;
create temp table uniquetitleswithtriples
as select words,titles,ids from (select max(length(words)) as len,words,titles,ids from uniquesplittitles group by ids);
create index uniquetitleswithtriples_index
on uniquetitleswithtriples(words,titles,ids);
delete from triples where id in (select ids as id from uniquesplittitles group by id);
-- plurally distinguished titles
create temp table splittitles
as select words,titles,ids from (select words,jgroup(title) as titles, jgroup(id) as ids ,count(distinct(id)) as num from triples group by words)
where num <= (select * from (select jgroupuniquelimit(ids,n,(select count(distinct id) from triples)/2) from (select jgroup(id) as ids,count(id) as n from triples group by words order by n asc)));
delete from triples where id in (select id from (select jsplitv(ids) as id from splittitles) group by id);
insert into splittitles
select words,titles,ids from (select words,jgroup(title) as titles, jgroup(id) as ids ,count(distinct(id)) as num from triples group by words)
where num <= (select * from (select jgroupuniquelimit(ids,n,(select count(distinct id) from triples)/2) from (select jgroup(id) as ids,count(id) as n from triples group by words order by n asc)));
delete from triples where id in (select id from (select jsplitv(ids) as id from splittitles) group by id);
insert into splittitles
select words,titles,ids from (select words,jgroup(title) as titles, jgroup(id) as ids ,count(distinct(id)) as num from triples group by words);
create temp table titleswithtriplesmaxwords
as select words,titles,ids from (select max(length(words)) as len,words,titles,ids from splittitles group by ids);
-- id bag of words creation and normalization
create temp table bagofwordsforids
as select id, title, year,lower(publisher) as publisher,lower(authorsurnames) as authorsurnames, lower(authorfirstnames) as authorfirstnames from metadata;
update bagofwordsforids
set year = jmergeregexp(jset(s2j(comprspaces(filterstopwords(regexpr('\b\w{1,3}\b',regexpr('\W|_',year,' '),'')))))), publisher = jmergeregexp(jset(s2j(comprspaces(filterstopwords(regexpr('\b\w{1,3}\b',regexpr('\W|_',publisher,' '),'')))))), authorsurnames = jmergeregexp(jset(s2j(comprspaces(filterstopwords(regexpr('\b\w{1,3}\b',regexpr('\W|_',authorsurnames,' '),'')))))),authorfirstnames = jmergeregexp(jset(s2j(comprspaces(filterstopwords(regexpr('\b\w{1,3}\b',regexpr('\W|_',authorfirstnames,' '),''))))));
create index bagofwords_index on bagofwordsforids(id,title,authorsurnames,year,authorfirstnames,publisher);
-- Connect singularly distinguished titles with their bag of words
create temp table uniquetitleswithtriples_bagofwords
as select id,titles,words , year,publisher,authorsurnames,authorfirstnames from uniquetitleswithtriples, bagofwordsforids where ids = id;
create index uniquetitleswithtriples_bagofwords_index
on uniquetitleswithtriples_bagofwords(words,titles,year,publisher,authorsurnames,authorfirstnames);
-- Extract singularly distinguished titles from multiple distinguished titles, connect with bag of words and insert into unique titles
insert into uniquetitleswithtriples_bagofwords
select id, titles, words , year, publisher, authorsurnames,authorfirstnames
from (select * from titleswithtriplesmaxwords where jlen(titles)=1), bagofwordsforids
where ids = id ;
delete from titleswithtriplesmaxwords where jlen(titles) = 1;
-- Split vertically multiple distinguished titles
create temp table titles
as select words,titles from titleswithtriplesmaxwords;
create temp table idswords
as select words,ids from titleswithtriplesmaxwords;
create temp table jsplittitles
as select words,jsplitv(titles) as titles from titles;
create temp table jsplitids
as select words,jsplitv(ids) as ids from idswords;
create temp table jsplittitlesids
as select jsplitids.words as words,jsplitids.ids as ids,jsplittitles.titles from jsplitids,jsplittitles where jsplittitles.rowid = jsplitids.rowid;
create temp table distincttitlesidswords
as select words,titles,ids from (select max(length(words)),words,titles,ids from jsplittitlesids group by titles);
create temp table distincttitlesidswords_bag
as select id, titles,words,year,publisher,authorsurnames,authorfirstnames from distincttitlesidswords,bagofwordsforids where distincttitlesidswords.ids = bagofwordsforids.id;
create index distincttitlesidswords_index
on distincttitlesidswords_bag(words,titles,year,publisher,authorsurnames,authorfirstnames);
-- Merge everything into one table
insert into distincttitlesidswords_bag
select * from uniquetitleswithtriples_bagofwords;
create temp table titlesandtriples1
as select id,titles,words,year,publisher,authorsurnames,authorfirstnames from (select id, max(length(words)),words,year,publisher,authorsurnames,authorfirstnames,titles from distincttitlesidswords_bag group by titles);
drop table if exists titlesandtriples;
create table titlesandtriples
as select id, lower(titles) as titles, lower(words) as words,year,publisher,authorsurnames,authorfirstnames from (select id, max(length(words)),words,year,publisher,authorsurnames,authorfirstnames,titles from titlesandtriples1 group by id);
create index titlestriples_index
on titlesandtriples(words,titles,year,publisher,authorsurnames,authorfirstnames);