-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfind_license.py
290 lines (239 loc) · 9.94 KB
/
find_license.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse
from typing import Optional
import logging
from find_shared import get_base_url_from_soup
logger = logging.getLogger(__name__)
# License patterns dictionary
LICENSE_PATTERNS = {
'cc-by': [
r'creative\s*commons\s*attribution',
r'cc\s*by\b',
],
'cc-by-nc': [
r'creative\s*commons\s*attribution\s*-?\s*non\s*commercial',
r'cc\s*by\s*-?\s*nc\b',
],
'cc-by-sa': [
r'creative\s*commons\s*attribution\s*-?\s*share\s*alike',
r'cc\s*by\s*-?\s*sa\b',
],
'cc-by-nc-sa': [
r'creative\s*commons\s*attribution\s*-?\s*non\s*commercial\s*-?\s*share\s*alike',
r'cc\s*by\s*-?\s*nc\s*-?\s*sa\b',
],
'cc-by-nc-nd': [
r'creative\s*commons\s*attribution\s*-?\s*non\s*commercial\s*-?\s*no\s*derivatives',
r'cc\s*by\s*-?\s*nc\s*-?\s*nd\b',
],
}
# Text patterns to search for license info
TEXT_PATTERNS = [
r"(creativecommons.org/licenses/[a-z\-]+)",
r"distributed under the terms (.*?) which permits",
r"This is an open access article under the terms (.*?) which permits",
r"This is an open-access article distributed under the terms (.*?), where it is permissible",
r"This is an open access article published under (.*?) which permits",
r'<div class="openAccess-articleHeaderContainer(.*?)</div>',
r'this article is published under the creative commons (.*?) licence',
r'This work is licensed under a Creative Commons (.*?) which permits',
r'licensed under a Creative Commons (.*?) License',
r'under Creative Commons (.*?) license',
r'distributed under a Creative Commons (.*?) License',
r'текст статьи',
r'Available under( the)? Creative Commons (.*?) License',
r'This is an Open Access article distributed under (.*?) license',
]
def find_license_in_html(page_content: str) -> Optional[str]:
"""Find license information in HTML content using BeautifulSoup."""
try:
soup = BeautifulSoup(page_content, 'html.parser')
# Check if we should trust this publisher
if not _trust_publisher_license(soup):
logger.info("Publisher not trusted for license information")
return None
# Get the potential text that might contain license info
license_text = page_potential_license_text(page_content)
if not license_text:
return None
# First look for Creative Commons license URLs
cc_url_match = re.search(r'creativecommons\.org/licenses/([a-z-]+)', license_text, re.IGNORECASE)
if cc_url_match:
license_code = cc_url_match.group(1)
if license_code:
return license_code
# Then look for license text patterns
for pattern in TEXT_PATTERNS:
matches = re.findall(pattern, license_text, re.IGNORECASE)
if matches:
normalized_license = find_normalized_license(matches[0])
if normalized_license:
return normalized_license
return "unspecified-oa"
# Check for specific publisher license patterns
publisher_license = check_publisher_specific_licenses(soup, license_text)
if publisher_license:
return publisher_license
return None
except Exception as e:
logger.error(f"Error finding license: {str(e)}")
return None
def check_publisher_specific_licenses(soup: BeautifulSoup, license_text: str) -> Optional[str]:
"""Check for publisher-specific license indicators."""
base_url = get_base_url_from_soup(soup)
if not base_url:
return None
hostname = urlparse(base_url).hostname or ''
# Publisher-specific patterns
publisher_patterns = [
(r'sciencedirect\.com/', r'<div class="OpenAccessLabel">open access</div>', 'unspecified-oa'),
(r'projecteuclid\.org/', r'<strong>Full-text: Open access</strong>', 'unspecified-oa'),
(r'journals\.ametsoc\.org/', r'src="/templates/jsp/_style2/_ams/images/access_free\.gif"', 'unspecified-oa'),
(r'cambridge\.org/', r'<span[^>]*class="open-access"[^>]*>Open access</span>', 'unspecified-oa'),
(r'degruyter\.com/', r'<span>Open Access</span>', 'unspecified-oa'),
]
for url_pattern, text_pattern, license_type in publisher_patterns:
if re.search(url_pattern, hostname, re.IGNORECASE):
if re.search(text_pattern, license_text, re.IGNORECASE):
return license_type
# Special handling for T&F license tab
if re.match(r'^https?://(?:www\.)?tandfonline\.com/doi/full/(10\..+)', base_url, re.IGNORECASE):
tf_license = check_tandfonline_license(base_url)
if tf_license:
return tf_license
return None
def check_tandfonline_license(url: str) -> Optional[str]:
"""Check T&F license tab for license info."""
try:
if url_match := re.match(r'^https?://(?:www\.)?tandfonline\.com/doi/full/(10\..+)', url, re.IGNORECASE):
license_tab_url = f'https://www.tandfonline.com/action/showCopyRight?doi={url_match.group(1)}'
logger.info(f'Checking T&F license tab: {license_tab_url}')
# Note: You'll need to implement the actual HTTP request and parsing
# This is just the structure
return None
except Exception as e:
logger.error(f"Error checking T&F license: {str(e)}")
return None
def page_potential_license_text(page: str) -> Optional[str]:
"""Get text that might contain license info using BeautifulSoup."""
try:
soup = BeautifulSoup(page, 'html.parser')
# Additional sections to remove that might have false positives
selectors_to_remove = [
"div.view-pnas-featured",
"div.references",
"div.footnotes",
"div.supplementary",
"div.table-of-content",
"div.author-notes",
"div.related-content",
".citation-references",
# Adding missing sections from original
"div.citedBySection",
"div#supplementary-material",
"section#article-references",
"div.relatedItem",
"div.table-of-content",
]
for selector in selectors_to_remove:
for element in soup.select(selector):
element.decompose()
# Get specific sections that often contain license info
license_sections = []
# Check meta tags first
meta_tags = ['dc.rights', 'prism.copyright', 'citation_license',
'dc.accessRights', 'dc.rights.license'] # Added missing meta tags
license_meta = soup.find('meta', {'name': meta_tags})
if license_meta and license_meta.get('content'):
license_sections.append(license_meta['content'])
# Extended list of license containers
license_containers = soup.select(
'div.license, '
'div.permissions, '
'div.copyright-notice, '
'div.article-license, '
'section.license, '
'div[id*="license"], '
'div[class*="license"], '
'div[id*="copyright"], '
'div[class*="copyright"], '
# Adding missing containers
'div.openAccessLabel, '
'div.article-header__access, '
'span.pdf-notice'
)
for container in license_containers:
license_sections.append(container.get_text(' ', strip=True))
# Combine and clean text
if license_sections:
combined_text = ' '.join(license_sections)
return ' '.join(combined_text.split())
# If no specific sections found, try full page content
main_content = soup.find(['main', 'article'])
if main_content:
return main_content.get_text(' ', strip=True)
body = soup.find('body')
if body:
return body.get_text(' ', strip=True)
return page
except Exception as e:
logger.error(f"Error parsing HTML: {str(e)}")
return page
def find_normalized_license(text: str) -> Optional[str]:
"""
Convert license text to a normalized format.
Args:
text: License text to normalize
Returns:
str: Normalized license string or None
"""
text = text.lower().strip()
# Check for Creative Commons URL pattern first
cc_url_match = re.search(r'creativecommons\.org/licenses/([a-z-]+)', text)
if cc_url_match:
license_code = cc_url_match.group(1)
if license_code in LICENSE_PATTERNS:
return license_code
# Check each license pattern
for normalized_license, patterns in LICENSE_PATTERNS.items():
if any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns):
return normalized_license
return None
def _trust_publisher_license(soup: BeautifulSoup) -> bool:
"""
Check if publisher's license info should be trusted.
Args:
soup: BeautifulSoup object
Returns:
bool: Whether to trust the publisher
"""
base_url = get_base_url_from_soup(soup)
if not base_url:
return True
hostname = urlparse(base_url).hostname
if not hostname:
return True
# Publishers known to have unreliable license info
untrusted_hosts = {
'indianjournalofmarketing.com',
'rnajournal.cshlp.org',
'press.umich.edu',
'genome.cshlp.org',
'medlit.ru',
'journals.eco-vector.com',
'alife-robotics.co.jp',
'molbiolcell.org',
'jcog.com.tr',
'aimsciences.org',
'berghahnjournals.com',
'ojs.ual.es',
}
# Special case for rupress.org
if hostname.endswith('rupress.org'):
volume_match = re.findall(r'rupress\.org/jcb/[^/]+/(\d+)', base_url)
try:
return bool(volume_match and int(volume_match[0]) < 217)
except ValueError:
return False
return not any(hostname.endswith(host) for host in untrusted_hosts)