SocArXiv_csv_list_generator.py
import csv
import requests
import re
from pylatexenc.latex2text import LatexNodes2Text
import streamlit as st
from io import StringIO
# Set the title of the Streamlit app
st.title('SocArXiv Preprints Link Generator')
# Provide information about the app
st.write(
'''
\n Version 1.1.0.
\n This app generates a list of SocArXiv preprints based on the selected subject and a title search.
Simply choose a subject from the dropdown list, enter keywords in the search box, and click 'Search Preprints'
to generate a CSV file listing authors, titles, subjects, OSF links, and PDF download links.
\n When the search is complete, a download button for the CSV file appears.
\n Searching across all subjects might take some time. Feel free to fetch yourself a drink in the meantime.
\n Unfortunately, the Streamlit server sometimes crashes. In that case, you may consider downloading the code from my GitHub repo and running it on a local host.
GitHub Repo: https://github.com/ramyologist/SocArXiv-CSV-Generator
Cheers,
ramyologist
'''
)
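# A note on running this app locally, as suggested above (inferred from the
# imports in this file, not spelled out in the original): install the
# third-party dependencies and launch the script with Streamlit, e.g.
#
#   pip install streamlit requests pylatexenc
#   streamlit run SocArXiv_csv_list_generator.py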
# Define the base URL for the SocArXiv API
base_url = "https://api.osf.io/v2/providers/preprints/socarxiv/preprints/"
# Define the URL for fetching subject data
subject_url = "https://api.osf.io/v2/providers/preprints/socarxiv/taxonomies/"
# Function to fetch all available subjects from the SocArXiv API
def fetch_all_subjects():
    subjects = []
    page = 1
    while True:
        params = {"page[size]": 100, "page": page}
        response = requests.get(subject_url, params=params)
        data = response.json()
        if "data" not in data or not data["data"]:
            break
        subjects.extend([subject["attributes"]["text"] for subject in data["data"]])
        page += 1
    return subjects
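# Optional: the full taxonomy is re-fetched on every Streamlit rerun. On
# Streamlit versions that provide st.cache_data (a sketch under that
# assumption, not part of the original script), the result can be cached by
# decorating the function:
#
#   @st.cache_data
#   def fetch_all_subjects():
#       ...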
# Fetch all available subjects
available_subjects = fetch_all_subjects()
# Add "All Subjects" as an option at the beginning of the subject list
available_subjects.insert(0, "All Subjects")
# Create a dropdown to select a subject
subject_filter = st.selectbox("Select a subject:", available_subjects)
# Function to fetch metadata for a given DOI
def fetch_doi_metadata(doi):
    # Expects a bare DOI such as "10.31235/osf.io/xxxxx" and asks the DOI
    # resolver for BibTeX via content negotiation
    headers = {
        "Accept": "application/x-bibtex"
    }
    response = requests.get(f"https://doi.org/{doi}", headers=headers)
    return response.text if response.status_code == 200 else None
# Function to extract year and author information from BibTeX metadata
def extract_bibtex_metadata(bibtex_str):
    # Note: the author pattern captures up to the first closing brace, so
    # author fields containing nested braces may be truncated
    year_match = re.search(r"year\s*=\s*{(\d{4})", bibtex_str)
    author_match = re.search(r"author\s*=\s*{([^}]+)", bibtex_str)
    year = year_match.group(1) if year_match else None
    author = author_match.group(1) if author_match else None

    # Convert LaTeX-encoded author names to plain text
    def latex_to_unicode(latex_str):
        return LatexNodes2Text().latex_to_text(latex_str)

    if author:
        author = latex_to_unicode(author)
    return year, author
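# For illustration, a hypothetical BibTeX record (not real resolver output):
#
#   @article{gomez_2021, author={G\'omez, Mar\'ia}, year={2021}}
#
# extract_bibtex_metadata would return ("2021", "Gómez, María"): the regexes
# pull out the year and author fields, and pylatexenc converts the LaTeX
# accent escapes to Unicode.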
# Function to fetch preprints for a specific subject and retrieve author and year information
def fetch_subject_preprints_with_year_and_author(subject_filter, search_keywords, progress_text, progress_bar):
    preprints = []
    page = 1
    total_preprints = 0
    while True:
        params = {
            "page[size]": 100,
            "page": page,
            # requests drops None-valued parameters, so "All Subjects" omits the filter
            "filter[subjects]": subject_filter if subject_filter != "All Subjects" else None,
        }
        response = requests.get(base_url, params=params)
        data = response.json()
        new_preprints_count = len(data.get("data", []))
        if not new_preprints_count:
            break
        for preprint in data["data"]:
            title = preprint["attributes"]["title"]
            # e.g. https://doi.org/10.31235/osf.io/abcde -> https://osf.io/preprints/socarxiv/abcde/
            osf_link = preprint["links"]["preprint_doi"].replace("https://doi.org/10.31235/osf.io/", "https://osf.io/preprints/socarxiv/") + "/"
            doi = f"10.31235/osf.io/{osf_link.split('/')[-2]}"  # bare DOI for the resolver
            download_link = f"{osf_link}download"
            if search_keywords.lower() in title.lower():  # Filter by search keywords in the title
                metadata = fetch_doi_metadata(doi)
                _, author = extract_bibtex_metadata(metadata) if metadata else (None, None)
                preprints.append((author, title, subject_filter, osf_link, download_link))
            total_preprints += 1
            progress_text.text(f"Total Preprints Fetched: {total_preprints}")
            # 15,000 is a rough estimate of the corpus size; clamp so st.progress never exceeds 1.0
            progress_bar.progress(min(total_preprints / 15000, 1.0))
        page += 1
    return preprints
# Function to create a CSV file containing selected preprints' columns
def create_csv_with_selected_columns(preprints):
    csv_file = StringIO()
    writer = csv.writer(csv_file)
    writer.writerow(["Author", "Title", "Subject", "OSF Link", "Download Link"])
    for author, title, subject, osf_link, download_link in preprints:
        writer.writerow((author, title, subject, osf_link, download_link))
    return csv_file.getvalue()
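# For illustration, a row in the resulting CSV might look like this
# (hypothetical values, not real data):
#
#   "Gómez, María","Some Preprint Title","Sociology","https://osf.io/preprints/socarxiv/abcde/","https://osf.io/preprints/socarxiv/abcde/download"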
# Create a text input for searching keywords in titles
search_keywords = st.text_input("Search Keywords in Titles")
# If the "Search Preprints" button is clicked
if st.button("Search Preprints"):
    progress_text = st.empty()
    progress_bar = st.progress(0.0)
    # Fetch and search preprints based on the selected subject and title keywords
    preprints = fetch_subject_preprints_with_year_and_author(subject_filter, search_keywords, progress_text, progress_bar)
    # Create a CSV file containing the preprints' information
    csv_content = create_csv_with_selected_columns(preprints)
    # Remove the progress bar
    progress_bar.empty()
    # Add a download button for the CSV file
    st.download_button("Download CSV", csv_content, "preprints_by_title_and_subject.csv", "text/csv")