Skip to content

Commit

Permalink
getWith -> get
Browse files Browse the repository at this point in the history
  • Loading branch information
jiaojiaodubai committed Aug 21, 2024
1 parent 1353a67 commit c9381b1
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 96 deletions.
126 changes: 63 additions & 63 deletions CQVIP Knowledge.js
Original file line number Diff line number Diff line change
Expand Up @@ -114,66 +114,66 @@ async function scrape(doc, url = doc.location.href) {
Z.debug(doc.body.innerText);
let creators = [];
let creatorsEn = [];
const labels = new LabelsX(doc, '.article-detail > p');
const labels = new Labels(doc, '.article-detail > p');
const extra = new Extra();
Z.debug(labels.innerData.map(arr => [arr[0], ZU.trimInternal(arr[1].textContent)]));
const newItem = new Z.Item(detectWeb(doc, url));
newItem.title = attr(doc, '.article-summary > h1', 'title').replace(/_\((.+?)\)/g, '<sub>$1</sub>').replace(/_(\d)/g, '<sub>$1</sub>');
extra.set('original-title', text(doc, '.article-summary > em').replace(/_\((.+?)\)/g, '<sub>$1</sub>').replace(/_(\d)/g, '<sub>$1</sub>'), true);
newItem.abstractNote = labels.getWith('摘要');
newItem.abstractNote = labels.get('摘要');
switch (newItem.itemType) {
case 'journalArticle': {
newItem.publicationTitle = attr(doc, 'a[href*="/asset/journal/"]', 'title');
extra.set('original-container-title', tryMatch(labels.getWith('出版物'), /\((\w+)\)$/, 1), true);
const pubInfo = labels.getWith('年卷期');
extra.set('original-container-title', tryMatch(labels.get('出版物'), /\((\w+)\)$/, 1), true);
const pubInfo = labels.get('年卷期');
newItem.volume = tryMatch(pubInfo, /0*(\d+)/, 1);
newItem.issue = tryMatch(pubInfo, /([A-Z\d]+)/, 1).replace(/0*(\d)/, '$1');
newItem.pages = labels.getWith('页码').replace(/\+/g, ', ').replace(/~/g, '-');
newItem.pages = labels.get('页码').replace(/\+/g, ', ').replace(/~/g, '-');
newItem.date = tryMatch(pubInfo, /^(\d{4})/, 1);
creators = Array.from(doc.querySelectorAll('.author > a')).map(element => cleanAuthor(ZU.trimInternal(element.textContent)));
creatorsEn = text(doc, '.author > em').split(/[;]\s*/).map(enName => ZU.capitalizeName(enName));
extra.set('foundation', labels.getWith('基金'));
extra.set('foundation', labels.get('基金'));
break;
}
case 'book':
newItem.series = labels.getWith('丛书名');
newItem.series = labels.get('丛书名');
// http://k.cqvip.com/asset/detail/101996618144
newItem.edition = labels.getWith('版本说明');
newItem.publisher = labels.getWith('出版社');
newItem.date = ZU.strToISO(labels.getWith('出版年'));
newItem.numPages = tryMatch(labels.getWith('页数'), /\d+/);
newItem.ISBN = ZU.cleanISBN(labels.getWith('ISBN'));
newItem.edition = labels.get('版本说明');
newItem.publisher = labels.get('出版社');
newItem.date = ZU.strToISO(labels.get('出版年'));
newItem.numPages = tryMatch(labels.get('页数'), /\d+/);
newItem.ISBN = ZU.cleanISBN(labels.get('ISBN'));
creators = Array.from(doc.querySelectorAll('.author > a')).map(element => cleanAuthor(ZU.trimInternal(element.textContent)));
break;
case 'thesis':
newItem.thesisType = labels.getWith('学位级别') + '学位论文';
newItem.university = labels.getWith('作者单位');
newItem.date = ZU.strToISO(labels.getWith('授予年度'));
newItem.thesisType = labels.get('学位级别') + '学位论文';
newItem.university = labels.get('作者单位');
newItem.date = ZU.strToISO(labels.get('授予年度'));
creators = Array.from(doc.querySelectorAll('.author > a')).map(element => cleanAuthor(ZU.trimInternal(element.textContent)));
labels.getWith('导师姓名').split(/[;]/).forEach(creator => creators.push(cleanAuthor(creator, 'contributor')));
labels.get('导师姓名').split(/[;]/).forEach(creator => creators.push(cleanAuthor(creator, 'contributor')));
break;
case 'conferencePaper':
newItem.date = ZU.strToISO(labels.getWith('会议日期'));
newItem.conferenceName = labels.getWith('会议名称').replace(/^|$/g, '');
newItem.date = ZU.strToISO(labels.get('会议日期'));
newItem.conferenceName = labels.get('会议名称').replace(/^|$/g, '');
creators = Array.from(doc.querySelectorAll('.author > a')).map(element => cleanAuthor(ZU.trimInternal(element.textContent)));
break;
case 'standard':
newItem.title = attr(doc, '.article-summary > h1', 'title').replace(/([\u4e00-\u9fff]) ([\u4e00-\u9fff])/, '$1 $2');
newItem.number = labels.getWith('标准编号');
newItem.date = ZU.strToISO(labels.getWith('发布日期'));
extra.set('applyDate', labels.getWith('实施日期'));
newItem.numPages = tryMatch(labels.getWith('页码'), /\d+/);
extra.set('CCS', labels.getWith('中国标准分类号'));
extra.set('ICS', labels.getWith('国际标准分类号'));
newItem.number = labels.get('标准编号');
newItem.date = ZU.strToISO(labels.get('发布日期'));
extra.set('applyDate', labels.get('实施日期'));
newItem.numPages = tryMatch(labels.get('页码'), /\d+/);
extra.set('CCS', labels.get('中国标准分类号'));
extra.set('ICS', labels.get('国际标准分类号'));
break;
case 'patent':
newItem.patentNumber = labels.getWith('公开号').split(';')[0];
newItem.applicationNumber = labels.getWith('专利申请号').split(';')[0];
newItem.patentNumber = labels.get('公开号').split(';')[0];
newItem.applicationNumber = labels.get('专利申请号').split(';')[0];
newItem.place = newItem.country = patentCountry(newItem.patentNumber || newItem.applicationNumber);
newItem.filingDate = labels.getWith('申请日');
newItem.issueDate = labels.getWith('公开日');
extra.set('Genre', labels.getWith('专利类型'), true);
creators = Array.from(labels.getWith('发明人', true).querySelectorAll('a')).map(element => cleanAuthor(ZU.trimInternal(element.textContent)));
newItem.filingDate = labels.get('申请日');
newItem.issueDate = labels.get('公开日');
extra.set('Genre', labels.get('专利类型'), true);
creators = Array.from(labels.get('发明人', true).querySelectorAll('a')).map(element => cleanAuthor(ZU.trimInternal(element.textContent)));
break;

/*
Expand All @@ -183,41 +183,41 @@ async function scrape(doc, url = doc.location.href) {
*/
case 'statute': {
newItem.title = newItem.title.replace(/\((.+?)\)/, '($1)');
const rank = labels.getWith('效力级别');
const rank = labels.get('效力级别');
if (rank == '党内法规制度' || newItem.title.includes('草案')) {
newItem.itemType = 'report';
newItem.shortTitle = tryMatch(newItem.title, /^(.+).+/, 1);
newItem.date = labels.getWith('颁布日期');
newItem.date = labels.get('颁布日期');
}
else {
if (!labels.getWith('效力级别').includes('法律')) {
if (!labels.get('效力级别').includes('法律')) {
extra.set('Type', 'regulation', true);
}
if (newItem.title.startsWith('中华人民共和国')) {
newItem.shortTitle = newItem.title.substring(7);
}
newItem.publicLawNumber = labels.getWith('发文文号');
newItem.dateEnacted = labels.getWith('颁布日期');
if (labels.getWith('时效性') == '已失效') {
newItem.publicLawNumber = labels.get('发文文号');
newItem.dateEnacted = labels.get('颁布日期');
if (labels.get('时效性') == '已失效') {
extra.set('Status', '已废止', true);
}
}

extra.set('applyDate', labels.getWith('实施日期'));
creators = Array.from(labels.getWith('颁布部门', true).querySelectorAll('a')).map(element => cleanAuthor(ZU.trimInternal(element.textContent)));
extra.set('applyDate', labels.get('实施日期'));
creators = Array.from(labels.get('颁布部门', true).querySelectorAll('a')).map(element => cleanAuthor(ZU.trimInternal(element.textContent)));
break;
}
case 'report':
newItem.reportType = '科技成果报告';
newItem.institution = labels.getWith('完成单位');
newItem.date = labels.getWith('公布年份');
extra.set('achievementType', labels.getWith('成果类别'));
newItem.institution = labels.get('完成单位');
newItem.date = labels.get('公布年份');
extra.set('achievementType', labels.get('成果类别'));
creators = Array.from(doc.querySelectorAll('.author > a')).map(element => cleanAuthor(ZU.trimInternal(element.textContent)));
break;
case 'newspaperArticle':
newItem.publicationTitle = attr(labels.getWith('报纸名称', true), 'a', 'title');
newItem.date = labels.getWith('发表日期');
newItem.pages = labels.getWith('版名版号').replace(/0*(\d)/, '$1');
newItem.publicationTitle = attr(labels.get('报纸名称', true), 'a', 'title');
newItem.date = labels.get('发表日期');
newItem.pages = labels.get('版名版号').replace(/0*(\d)/, '$1');
creators = Array.from(doc.querySelectorAll('.author > a')).map(element => cleanAuthor(ZU.trimInternal(element.textContent)));
break;
}
Expand All @@ -233,63 +233,63 @@ async function scrape(doc, url = doc.location.href) {
}
extra.set('creatorsExt', JSON.stringify(creators));
}
labels.getWith(['主题', '关键词'], true).querySelectorAll('a').forEach(element => newItem.tags.push(ZU.trimInternal(element.textContent)));
extra.set('CLC', labels.getWith('中图分类'));
labels.get(['主题', '关键词'], true).querySelectorAll('a').forEach(element => newItem.tags.push(ZU.trimInternal(element.textContent)));
extra.set('CLC', labels.get('中图分类'));
newItem.extra = extra.toString();
return newItem;
}

/**
 * Index of the "label / value" rows found on a detail page.
 *
 * Every element matched by `selector` is cloned and split into a label key and
 * a value element. The resulting `[key, valueElement]` pairs are kept in
 * `this.data`, in document order.
 */
class Labels {
	/**
	 * @param {Document} doc - Document to read the rows from.
	 * @param {string} selector - CSS selector matching one element per row.
	 */
	constructor(doc, selector) {
		this.data = [];
		// Shared placeholder returned by get(label, true) when nothing matches,
		// so callers can chain querySelector*/textContent without null checks.
		this.emptyElm = doc.createElement('div');
		const isBlank = node => /^\s*$/.test(node.textContent);
		Array.from(doc.querySelectorAll(selector))
			// avoid nesting: keep only rows that do not contain another row
			.filter(element => !element.querySelector(selector))
			// avoid empty rows
			.filter(element => !isBlank(element))
			.forEach((element) => {
				// Work on a copy so the page itself is never modified.
				const elmCopy = element.cloneNode(true);
				// Drop leading whitespace-only children; the firstChild guard
				// keeps this from throwing if every child turns out to be blank.
				while (elmCopy.firstChild && isBlank(elmCopy.firstChild)) {
					elmCopy.removeChild(elmCopy.firstChild);
				}
				if (elmCopy.childNodes.length > 1) {
					// Several children: the first one is the label, the rest is the value.
					const key = elmCopy.removeChild(elmCopy.firstChild).textContent.replace(/\s/g, '');
					this.data.push([key, elmCopy]);
				}
				else {
					// Single child: label and value share one string, split at the
					// first ":" or "]" (optionally preceded by an opening "[").
					const text = ZU.trimInternal(elmCopy.textContent);
					const key = tryMatch(text, /^[[]?.+?[\]:]/).replace(/\s/g, '');
					elmCopy.textContent = tryMatch(text, /^[[]?.+?[\]:]\s*(.+)/, 1);
					this.data.push([key, elmCopy]);
				}
			});
	}

	/**
	 * Alias for `data` under its pre-rename name, so code that still reads
	 * `labels.innerData` keeps working.
	 *
	 * @returns {Array} The same array as `this.data`.
	 */
	get innerData() {
		return this.data;
	}

	/**
	 * Look up the value of a label.
	 *
	 * @param {string|string[]} label - Regular-expression source matched
	 *   case-insensitively against the stored keys (so regex metacharacters
	 *   must be escaped by the caller). An array is tried in order and the
	 *   first non-empty result is returned.
	 * @param {boolean} [element=false] - When true, return the value element
	 *   instead of its text.
	 * @returns {string|Element} The whitespace-normalised value text, or the
	 *   value element when `element` is true. When nothing matches, returns
	 *   `''` or the shared empty `<div>` respectively.
	 */
	get(label, element = false) {
		const fallback = element ? this.emptyElm : '';
		if (Array.isArray(label)) {
			const results = label.map(aLabel => this.get(aLabel, element));
			const found = element
				? results.find(node => !/^\s*$/.test(node.textContent))
				: results.find(string => string);
			return found || fallback;
		}
		const pattern = new RegExp(label, 'i');
		// The first row whose key matches wins.
		const keyVal = this.data.find(arr => pattern.test(arr[0]));
		if (!keyVal) {
			return fallback;
		}
		return element ? keyVal[1] : ZU.trimInternal(keyVal[1].textContent);
	}
}
Expand Down
66 changes: 33 additions & 33 deletions CQVIP.js
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,8 @@ async function doWeb(doc, url) {
}

async function scrape(doc, url = doc.location.href) {
const labels = new LabelsX(doc, '.horizontalData-f, .mainContainerDataList-item');
const doi = labels.getWith('DOI');
const labels = new Labels(doc, '.horizontalData-f, .mainContainerDataList-item');
const doi = labels.get('DOI');
if (doi) {
try {
await scrapeSearch(doi);
Expand Down Expand Up @@ -120,7 +120,7 @@ async function scrape(doc, url = doc.location.href) {
const extra = new Extra();
extra.set('citation', text(doc, '.horizontalData-content'));
extra.set('download', text(doc, '.horizontalData-content', 2));
extra.set('CLC', labels.getWith('中图分类号'));
extra.set('CLC', labels.get('中图分类号'));
item.language = /[\u4e00-\u9fff]/.test(item.title)
? 'zh-CN'
: 'en-US';
Expand All @@ -132,13 +132,13 @@ async function scrape(doc, url = doc.location.href) {
case 'thesis':
item.thesisType = `${item.edition}学位论文`;
delete item.edition;
labels.getWith('导师', true).querySelectorAll('.info-line > span').forEach((element) => {
labels.get('导师', true).querySelectorAll('.info-line > span').forEach((element) => {
item.creators.push(ZU.cleanAuthor(element.textContent, 'contributor'));
});
break;
case 'conferencePaper':
item.conferenceName = labels.getWith('会议名称');
item.place = labels.getWith('会议地点');
item.conferenceName = labels.get('会议名称');
item.place = labels.get('会议地点');
extra.set('organizer', item.publisher, true);
delete item.publisher;
break;
Expand All @@ -153,20 +153,20 @@ async function scrape(doc, url = doc.location.href) {
item.place = item.country = patentCountry(item.patentNumber || item.applicationNumber);
item.assignee = item.issuingAuthority;
delete item.issuingAuthority;
item.filingDate = labels.getWith('申请日');
item.issueDate = labels.getWith('公开\\(公告\\)日');
item.filingDate = labels.get('申请日');
item.issueDate = labels.get('公开\\(公告\\)日');
item.legalStatus = text(doc, '.legalstatus .el-table__row:first-child > td:nth-child(2)');
extra.set('IPC', labels.getWith('IPC分类号'));
item.rights = labels.getWith('主权项');
extra.set('IPC', labels.get('IPC分类号'));
item.rights = labels.get('主权项');
break;
case 'standard':
item.title = item.title
.replace(/([\u4e00-\u9fff]) ([\u4e00-\u9fff])/, '$1 $2')
.replace(/([\u4e00-\u9fff]): ?([\u4e00-\u9fff])/, '$1:$2');
item.number = tryMatch(refText, /^ID (.+)/m, 1);
delete item.publisher;
extra.set('CSC', labels.getWith('中国标准分类号'));
extra.set('ICS', labels.getWith('国际标准分类号'));
extra.set('CSC', labels.get('中国标准分类号'));
extra.set('ICS', labels.get('国际标准分类号'));
break;
}
item.creators.forEach((creator) => {
Expand Down Expand Up @@ -204,57 +204,57 @@ async function scrapeSearch(doi) {
await translator.translate();
}

/**
 * Index of the "label / value" rows found on a detail page.
 *
 * Every element matched by `selector` is cloned and split into a label key and
 * a value element. The resulting `[key, valueElement]` pairs are kept in
 * `this.data`, in document order.
 */
class Labels {
	/**
	 * @param {Document} doc - Document to read the rows from.
	 * @param {string} selector - CSS selector matching one element per row.
	 */
	constructor(doc, selector) {
		this.data = [];
		// Shared placeholder returned by get(label, true) when nothing matches,
		// so callers can chain querySelector*/textContent without null checks.
		this.emptyElm = doc.createElement('div');
		const isBlank = node => /^\s*$/.test(node.textContent);
		Array.from(doc.querySelectorAll(selector))
			// avoid nesting: keep only rows that do not contain another row
			.filter(element => !element.querySelector(selector))
			// avoid empty rows
			.filter(element => !isBlank(element))
			.forEach((element) => {
				// Work on a copy so the page itself is never modified.
				const elmCopy = element.cloneNode(true);
				// Drop leading whitespace-only children; the firstChild guard
				// keeps this from throwing if every child turns out to be blank.
				while (elmCopy.firstChild && isBlank(elmCopy.firstChild)) {
					elmCopy.removeChild(elmCopy.firstChild);
				}
				if (elmCopy.childNodes.length > 1) {
					// Several children: the first one is the label, the rest is the value.
					const key = elmCopy.removeChild(elmCopy.firstChild).textContent.replace(/\s/g, '');
					this.data.push([key, elmCopy]);
				}
				else {
					// Single child: label and value share one string, split at the
					// first ":" or "]" (optionally preceded by an opening "[").
					const text = ZU.trimInternal(elmCopy.textContent);
					const key = tryMatch(text, /^[[]?.+?[\]:]/).replace(/\s/g, '');
					elmCopy.textContent = tryMatch(text, /^[[]?.+?[\]:]\s*(.+)/, 1);
					this.data.push([key, elmCopy]);
				}
			});
	}

	/**
	 * Alias for `data` under its pre-rename name, so any code that still reads
	 * `labels.innerData` keeps working.
	 *
	 * @returns {Array} The same array as `this.data`.
	 */
	get innerData() {
		return this.data;
	}

	/**
	 * Look up the value of a label.
	 *
	 * @param {string|string[]} label - Regular-expression source matched
	 *   case-insensitively against the stored keys (so regex metacharacters
	 *   must be escaped by the caller, e.g. '公开\\(公告\\)日'). An array is
	 *   tried in order and the first non-empty result is returned.
	 * @param {boolean} [element=false] - When true, return the value element
	 *   instead of its text.
	 * @returns {string|Element} The whitespace-normalised value text, or the
	 *   value element when `element` is true. When nothing matches, returns
	 *   `''` or the shared empty `<div>` respectively.
	 */
	get(label, element = false) {
		const fallback = element ? this.emptyElm : '';
		if (Array.isArray(label)) {
			const results = label.map(aLabel => this.get(aLabel, element));
			const found = element
				? results.find(node => !/^\s*$/.test(node.textContent))
				: results.find(string => string);
			return found || fallback;
		}
		const pattern = new RegExp(label, 'i');
		// The first row whose key matches wins.
		const keyVal = this.data.find(arr => pattern.test(arr[0]));
		if (!keyVal) {
			return fallback;
		}
		return element ? keyVal[1] : ZU.trimInternal(keyVal[1].textContent);
	}
}
Expand Down

0 comments on commit c9381b1

Please sign in to comment.