Hydrogen v1.0.0
bcankara committed Dec 29, 2024
1 parent 9f9f15e commit c810fcf
Showing 22 changed files with 338,852 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .env
@@ -0,0 +1,2 @@
SCOPUS_API_KEY=
UNPAYWALL_EMAIL=
360 changes: 360 additions & 0 deletions DataProcessor.py
@@ -0,0 +1,360 @@
import os
import sys
import time
import shutil
import threading
import pandas as pd
from dotenv import load_dotenv
from Main.modules import (
ensure_dir,
find_files,
find_data_folders,
save_statistics,
save_comprehensive_statistics,
merge_databases_simple,
merge_databases_enhanced,
compare_merge_methods,
create_result_folder
)
from datetime import datetime

class ProcessIndicator:
    """Spinner-style progress indicator for long-running console operations."""
    busy = False
    delay = 0.1  # seconds between spinner frames

    @staticmethod
    def indicator():
        # Cycle endlessly through the spinner characters.
        while True:
            for cursor in '|/-\\':
                yield cursor

    def __init__(self, description="Processing"):
        self.description = description
        self.generator = self.indicator()
        self.busy = False
        self.visible = False
        self._screen_lock = threading.Lock()  # guards stdout against the spinner thread

def write_next(self):
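        # Draw the next frame in place: print one spinner character, then
        # back the cursor up with '\b' so the next frame overwrites it.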
with self._screen_lock:
if not self.visible:
sys.stdout.write(f"\r{self.description:<50}")
self.visible = True
sys.stdout.write(next(self.generator))
sys.stdout.flush()
sys.stdout.write('\b')

def remove_indicator(self, cleanup=False):
with self._screen_lock:
if self.visible:
sys.stdout.write('\b')
self.visible = False
if cleanup:
sys.stdout.write(f"\r{self.description:<50}[Completed]\n")
sys.stdout.flush()

def indicator_task(self):
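        # Loop run by the spinner thread: redraw a frame every `delay`
        # seconds until `busy` is cleared by __exit__.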
while self.busy:
self.write_next()
time.sleep(self.delay)

    def __enter__(self):
        # Animate only when stdout is an interactive terminal; otherwise the
        # context manager is a silent no-op.
        if sys.stdout.isatty():
            self.busy = True
            self.thread = threading.Thread(target=self.indicator_task)
            self.thread.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if sys.stdout.isatty():
            self.busy = False
            self.thread.join()  # wait for the spinner thread to finish its last frame
            self.remove_indicator(cleanup=True)
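
# Example usage (hypothetical call site, for illustration; the spinner runs
# only when stdout is an interactive terminal):
#     with ProcessIndicator("Reading data"):
#         df = pd.read_excel("data.xlsx")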

def process_wos_data(input_file: str, output_file: str) -> tuple[bool, pd.DataFrame | None, dict]:
    """Convert a raw WoS export to Excel and collect basic statistics."""
    try:
        from Main.wos2xlsx import save_to_excel
        with ProcessIndicator("Converting WoS data to Excel format"):
            success = save_to_excel(input_file, output_file)

        if success:
            df = pd.read_excel(output_file)
            stats = {
                'Record Count': len(df),
                'Column Count': len(df.columns),
                'Non-Empty Columns': df.count().to_dict()  # non-null count per column
            }
            return True, df, stats
        return False, None, {}
    except Exception as e:
        sys.stderr.write(f"\nError: {str(e)}\n")
        return False, None, {}

def process_scopus_data(input_file: str, output_file: str) -> tuple[bool, pd.DataFrame | None, dict]:
    """Convert a raw Scopus export to Excel and collect basic statistics."""
    try:
        from Main.scp2xlsx import save_to_excel
        with ProcessIndicator("Converting Scopus data to Excel format"):
            success = save_to_excel(input_file, output_file)

        if success:
            df = pd.read_excel(output_file)
            stats = {
                'Record Count': len(df),
                'Column Count': len(df.columns),
                'Non-Empty Columns': df.count().to_dict()  # non-null count per column
            }
            return True, df, stats
        return False, None, {}
    except Exception as e:
        sys.stderr.write(f"\nError: {str(e)}\n")
        return False, None, {}

def merge_txt_files(data_dir: str) -> str:
"""Merge all txt files in data directory into wos_raw.txt in merged_raw folder"""
txt_files = find_files(data_dir, "txt")
if not txt_files:
raise ValueError("No WoS files (txt) found in Data folder.")

    # Create merged_raw directory if it doesn't exist
merged_raw_dir = os.path.join(data_dir, "merged_raw")
os.makedirs(merged_raw_dir, exist_ok=True)

output_file = os.path.join(merged_raw_dir, "wos_raw.txt")
with open(output_file, 'w', encoding='utf-8') as outfile:
for txt_file in txt_files:
with open(txt_file, 'r', encoding='utf-8') as infile:
outfile.write(infile.read())
outfile.write('\n') # Add newline between files

return output_file

def merge_csv_files(data_dir: str) -> str:
"""Merge all csv files in data directory into scp_raw.csv in merged_raw folder"""
csv_files = find_files(data_dir, "csv")
if not csv_files:
raise ValueError("No Scopus files (csv) found in Data folder.")

    # Create merged_raw directory if it doesn't exist
    merged_raw_dir = os.path.join(data_dir, "merged_raw")
    os.makedirs(merged_raw_dir, exist_ok=True)

    output_file = os.path.join(merged_raw_dir, "scp_raw.csv")

    if len(csv_files) == 1:
        # Only one CSV file: just copy it
        shutil.copy2(csv_files[0], output_file)
else:
# Read and combine all CSV files
all_data = []
for csv_file in csv_files:
df = pd.read_csv(csv_file, encoding='utf-8')
all_data.append(df)

# Concatenate all dataframes
combined_df = pd.concat(all_data, ignore_index=True)

# Save combined data
combined_df.to_csv(output_file, index=False, encoding='utf-8')

return output_file

def main():
try:
# Load environment variables
load_dotenv()

print("Database Merge Tool")
print("------------------")

# Find project folders
        workspace_dir = "Workspace"
        os.makedirs(workspace_dir, exist_ok=True)

project_folders = [d for d in os.listdir(workspace_dir) if os.path.isdir(os.path.join(workspace_dir, d))]

if not project_folders:
sys.stderr.write("\nError: No project folder found in Workspace.\n")
return

print("\nProject Folders:")
for i, folder in enumerate(project_folders, 1):
print(f"{i}. {folder}")

# Project selection
while True:
try:
choice = int(input("\nSelect project number: "))
if 1 <= choice <= len(project_folders):
project_dir = os.path.join(workspace_dir, project_folders[choice-1])
break
else:
sys.stderr.write("Error: Invalid selection.\n")
except ValueError:
sys.stderr.write("Error: Invalid input. Please enter a number.\n")

        # Create Data directory if it doesn't exist
        data_dir = os.path.join(project_dir, "Data")
        os.makedirs(data_dir, exist_ok=True)

print(f"\nSelected Project: {project_folders[choice-1]}")

# Create unique analysis directory
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
analysis_dir = os.path.join(project_dir, f"Analysis_{timestamp}")
os.makedirs(analysis_dir, exist_ok=True)

# Create subfolders
text_files_dir = os.path.join(analysis_dir, "Text_Files")
cell_files_dir = os.path.join(analysis_dir, "Cell_Files")
os.makedirs(text_files_dir, exist_ok=True)
os.makedirs(cell_files_dir, exist_ok=True)

# Set output files in analysis directory
wos_output = os.path.join(cell_files_dir, "WoS.xlsx")
scopus_output = os.path.join(cell_files_dir, "Scopus.xlsx")
merged_bib = os.path.join(cell_files_dir, "Merged_Bib.xlsx")
merged_vos = os.path.join(text_files_dir, "Merged_Vos.txt")
api_enriched_bib = os.path.join(cell_files_dir, "Merged_API_Enriched_Bib.xlsx")
api_enriched_vos = os.path.join(text_files_dir, "Merged_API_Enriched_Vos.txt")
stats_excel = os.path.join(analysis_dir, "Statistics.xlsx")
api_log = os.path.join(analysis_dir, "Api_Log.txt")
api_updates = os.path.join(analysis_dir, "Api_Update.xlsx")

# Check if merged_raw directory exists and contains required files
merged_raw_dir = os.path.join(data_dir, "merged_raw")
wos_raw = os.path.join(merged_raw_dir, "wos_raw.txt")
scp_raw = os.path.join(merged_raw_dir, "scp_raw.csv")

use_existing = False
if os.path.exists(merged_raw_dir) and os.path.exists(wos_raw) and os.path.exists(scp_raw):
while True:
choice = input("\nPreviously merged raw files found. What would you like to do?\n"
"1. Use previously merged files\n"
"2. Perform new merge\n"
"Your choice (1/2): ")
if choice in ['1', '2']:
use_existing = (choice == '1')
break
else:
sys.stderr.write("Invalid choice. Please enter 1 or 2.\n")

if not use_existing:
            if os.path.exists(merged_raw_dir):
                print("\nDeleting old merged files...")
                shutil.rmtree(merged_raw_dir)

# Merge txt files into wos_raw.txt
print("\nMerging process in progress...")
try:
wos_input = merge_txt_files(data_dir)
print(f"WoS files merged: {os.path.basename(wos_input)}")
except ValueError as e:
sys.stderr.write(f"\nError: {str(e)}\n")
return

# Merge csv files into scp_raw.csv
print("\nMerging Scopus files...")
try:
scopus_input = merge_csv_files(data_dir)
print(f"Scopus files merged: {os.path.basename(scopus_input)}")
except ValueError as e:
sys.stderr.write(f"\nError: {str(e)}\n")
return
else:
print("\nUsing existing merged files...")
wos_input = wos_raw
scopus_input = scp_raw

# Process WoS data
print("\nProcessing WoS data...")
wos_success, wos_df, wos_stats = process_wos_data(wos_input, wos_output)
if not wos_success:
sys.stderr.write("\nError: Failed to process WoS data.\n")
return
print("WoS data processing completed.")

# Process Scopus data
print("\nProcessing Scopus data...")
scopus_success, scopus_df, scopus_stats = process_scopus_data(scopus_input, scopus_output)
if not scopus_success:
sys.stderr.write("\nError: Failed to process Scopus data.\n")
return
print("Scopus data processing completed.")

# Get Scopus API key from environment variable
scopus_api_key = os.getenv('SCOPUS_API_KEY')
unpaywall_email = os.getenv('UNPAYWALL_EMAIL')

if not scopus_api_key:
print("\nWarning: Scopus API key not found in .env file.")
print("You can add it to the .env file to enable Scopus metadata enrichment.")
print("Continuing with other data sources...")

if not unpaywall_email:
print("\nWarning: Unpaywall email not found in .env file.")
print("You can add it to the .env file to enable Unpaywall metadata enrichment.")
print("Continuing with other data sources...")

# Enhanced merge
print("\nMerging process in progress...")
enhanced_success, enhanced_stats, enhanced_df = merge_databases_enhanced(
wos_df,
scopus_df,
merged_bib,
scopus_api_key=scopus_api_key,
unpaywall_email=unpaywall_email,
result_dir=analysis_dir
)

print("\nConverting data to VosViewer format...")
from Main.xlsx2vos import convert_excel_to_wos
convert_excel_to_wos(merged_bib, merged_vos)
print("VosViewer conversion completed.")

if enhanced_success:
# Save statistics
print("\nPreparing statistics...")
all_stats = {
'WoS Statistics': wos_stats,
'Scopus Statistics': scopus_stats,
'Merge Statistics': enhanced_stats
}

save_comprehensive_statistics(all_stats, wos_df, scopus_df, enhanced_df, stats_excel)
print("Statistics completed.")

# Check and convert API enriched file
if os.path.exists(api_enriched_bib):
print("\nConverting API Enriched data to VosViewer format...")
convert_excel_to_wos(api_enriched_bib, api_enriched_vos)
print("API Enriched VosViewer conversion completed.")

print("\nProcess completed successfully.")
print(f"\nAnalysis results saved to: {analysis_dir}")
print("Generated Files:")
print(f"1. WoS Data: {os.path.basename(wos_output)}")
print(f"2. Scopus Data: {os.path.basename(scopus_output)}")
print(f"3. Merged Data (Biblioshiny): {os.path.basename(merged_bib)}")
print(f"4. Merged Data (VosViewer): {os.path.basename(merged_vos)}")
print(f"5. Statistics: {os.path.basename(stats_excel)}")
print(f"6. API Log: {os.path.basename(api_log)}")
if os.path.exists(api_updates):
print(f"7. API Updates: {os.path.basename(api_updates)}")
if os.path.exists(api_enriched_bib):
print(f"8. API Enriched Data (Biblioshiny): {os.path.basename(api_enriched_bib)}")
print(f"9. API Enriched Data (VosViewer): {os.path.basename(api_enriched_vos)}")
else:
sys.stderr.write("\nError: Merge process failed.\n")

    except Exception as e:
        sys.stderr.write(f"\nAn unexpected error occurred: {str(e)}\n")
finally:
print("\nProgram terminating...")

if __name__ == "__main__":
main()