real_hil_example.py
import os
import random
from pprint import pformat

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.decomposition import PCA
import umap
import hdbscan

from experiment_setups.real_HiL.core.real_hil_logic import RealHiLLogic
from hil.utils.logger import Logger
from hil.utils.general_helper import save_results, plot_results, load_config, configure_paths, create_directory, load_model
from hil.utils.model_zoo import xcit_gc_nano_12_p8_224_dist
from hil.filter_noisy_images import NoisyClassifier
from hil.data_processor import DataProcessor


def main():
    """
    Main function to perform the real Human-in-the-Loop (HiL) experiment (cluster_specific or random).

    In the context of multiple HiL runs, where each run comprises an 'Initialization' phase followed by a series
    of sampling/relabeling 'loops', we designate the ongoing run as 'current_run'. Similarly, within each HiL run,
    we refer to the active loop as 'current_loop'.

    This experiment is interactive. During each loop (except for the initialization), a gallery in PDF format,
    similar to the CellSearch gallery, is generated in the same folder as this main Python script. The program
    then pauses and asks you to assign labels. After labeling, instructions on how to proceed are displayed in
    the terminal. If you stop the relabeling process, provide the same results path in the corresponding
    configuration file so that you can resume from the exact point where you left off.
    """
    configure_paths()

    # Load experiment configuration from config file
    cfg_path = "configs/real_hil_example_cfg.yml"
    config = load_config(cfg_path)

    # Extract experiment configurations
    n_runs = config["HiL_configurations"]["n_runs"]  # Number of HiL runs
    n_loops = config["HiL_configurations"]["n_loops"]  # "Initialization" and the number of sampling & relabeling loops within a HiL run
    experiment_name = config["experiment_setting"]["experiment_name"]  # Two strategies to choose from: cluster_specific or random
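    # For orientation, a minimal sketch of the structure that real_hil_example_cfg.yml is expected to have,
    # reconstructed only from the keys accessed in this script; every concrete value below is an illustrative
    # placeholder, not the project's actual setting:
    #
    #   device: "cuda:0"
    #   OPT: ...                                  # passed on as predictor_opt
    #   experiment_setting:
    #     experiment_name: "cluster_specific"     # or "random"
    #   HiL_configurations:
    #     n_runs: 3
    #     n_loops: 10
    #     class_names: [...]
    #     max_relabeling_pool_size: ...
    #     clustering_plot_in_each_loop: true
    #     shuffle_after_one_HiL_run: false
    #   general_paths:
    #     root_path: ...
    #     state_dict_path: ...
    #     path_to_results_folder: ...             # the "results path" to reuse when resuming a stopped experiment
    #     path_to_df_train_noisy_classifier: ...
    #     example_path_to_df_of_train_case: ...
    #     example_path_to_df_of_test_case: ...
    #   train_cases: [...]
    #   test_cases: [...]
    #   dataloader_parameters:
    #     batch_size: 64
    #     num_workers: 4
    #   noisy_classifier:
    #     pos_label: ...
    #   PCA_parameters: {...}
    #   UMAP_parameters: {...}
    #   HDBSCAN_parameters: {...}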
    # This is the backbone we used for the DINO implementation
    model = xcit_gc_nano_12_p8_224_dist(in_chans=3, num_classes=0)
    model = load_model(model=model, device=config['device'], path=config['general_paths']['state_dict_path'])

    # Initialize PCA, UMAP and HDBSCAN
    pca = PCA(**config["PCA_parameters"])
    umap_ = umap.UMAP(**config["UMAP_parameters"])
    clusterer = hdbscan.HDBSCAN(**config["HDBSCAN_parameters"])
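    # The three "*_parameters" sections of the config are unpacked straight into these constructors, so any keyword
    # accepted by sklearn's PCA, umap.UMAP or hdbscan.HDBSCAN can go there. A hedged example with illustrative
    # values (not the settings used in the paper):
    #
    #   PCA_parameters:
    #     n_components: 50
    #   UMAP_parameters:
    #     n_neighbors: 15
    #     min_dist: 0.1
    #     n_components: 2
    #   HDBSCAN_parameters:
    #     min_cluster_size: 50
    #     min_samples: 10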
    # Prepare directories for saving results
    save_path = os.path.join(config["general_paths"]["path_to_results_folder"], experiment_name)
    create_directory(save_path)
    create_directory(os.path.join(save_path, "plots"))
    data_save_path = os.path.join(config["general_paths"]["path_to_results_folder"], "data")
    create_directory(data_save_path)

    # Setup logger
    log_file_path = os.path.join(save_path, "real_HiL_logfile.log")
    logger = Logger(log_file_path=log_file_path)
    logger.info(f"Logger configured to write to {log_file_path}.")
    logger.info(f"Directories created at {save_path}.")
    logger.debug(f"Initializing real hil experiment with config: {pformat(config)}")

    # Set random seed for reproducibility
    random.seed(42)
    random_seed_lst = random.sample(range(1, 200), n_runs)
    ########################################################################################
    # OPTIONAL:
    # The unlabeled train set contains artefacts such as smeared cells and other noisy data.
    # A simple classifier therefore sorts out noisy images first.
    # If your data contains noisy images as well, you can follow the example of this class or construct your own.
    # If not, you can skip this part, but make sure you provide the DataProcessor with a df for the unlabeled
    # train set together with the corresponding features and labels (see the sketch right after this block).
    noisy_classifier = NoisyClassifier(
        root_path=config['general_paths']['root_path'],
        data_save_path=data_save_path,
        path_to_df_train_noisy_classifier=config['general_paths']['path_to_df_train_noisy_classifier'],
        example_path_to_df_of_train_case=config['general_paths']['example_path_to_df_of_train_case'],
        train_cases=config['train_cases'],
        device=config['device'],
        batch_size=config["dataloader_parameters"]["batch_size"],
        num_workers=config["dataloader_parameters"]["num_workers"],
        model=model,
        pca=pca,
        predictor_opt=config["OPT"],
        pos_label=config["noisy_classifier"]["pos_label"]
    )
    # train_unlabels holds the labels for the unlabeled train set.
    df_unlabeled_train, unlabeled_train_features, train_unlabels = noisy_classifier.sort_out_noisy_images()
    ########################################################################################
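    # If you skip the NoisyClassifier above, the DataProcessor still needs the same three objects. A rough,
    # hypothetical sketch of how they could be built; the path and the extract_features helper are placeholders
    # and not part of this repository:
    #
    #   df_unlabeled_train = pd.read_csv("path/to/unlabeled_train.csv")          # one row per unlabeled image
    #   unlabeled_train_features = extract_features(model, df_unlabeled_train)   # e.g. backbone embeddings
    #   train_unlabels = np.full(len(df_unlabeled_train), -1)                    # e.g. placeholder labels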
    # Class that processes the data: builds the dataloaders, extracts features, computes the PCA and UMAP
    # embeddings and performs the clustering
    data_processor = DataProcessor(
        root_path=config['general_paths']['root_path'],
        data_save_path=data_save_path,
        example_path_to_df_of_train_case=config['general_paths']['example_path_to_df_of_train_case'],
        example_path_to_df_of_test_case=config['general_paths']['example_path_to_df_of_test_case'],
        train_cases=config['train_cases'],
        test_cases=config['test_cases'],
        device=config['device'],
        batch_size=config["dataloader_parameters"]["batch_size"],
        num_workers=config["dataloader_parameters"]["num_workers"],
        model=model,
        pca=pca,
        umap_=umap_,
        clusterer=clusterer,
        df_unlabeled_train=df_unlabeled_train,
        unlabeled_train_features=unlabeled_train_features,
        train_unlabels=train_unlabels
    )
    # Dataframes to save the results
    df_general_results = pd.DataFrame()
    df_cluster_results = pd.DataFrame()
    num_of_relabeled_cells = []

    # Start the HiL runs
    for run in tqdm(range(n_runs)):
        current_run = run + 1
        random_seed = random_seed_lst[run]
        run_path = os.path.join(save_path, f"run_{current_run}")
        create_directory(run_path)

        # Perform the real-world HiL experiment
        real_hil = RealHiLLogic(
            save_path=run_path,
            root_path=config['general_paths']['root_path'],
            experiment_name=experiment_name,
            run=current_run,
            loops=n_loops,
            random_seed=random_seed,
            class_names=config["HiL_configurations"]["class_names"],
            predictor_opt=config["OPT"],
            logger=logger,
            data_processor=data_processor,
            num_of_relabeled_cells=num_of_relabeled_cells,
            max_relabeling_pool_size=config["HiL_configurations"]["max_relabeling_pool_size"],
            clustering_plot_in_each_loop=config["HiL_configurations"]["clustering_plot_in_each_loop"],
            shuffle_after_first_run=config["HiL_configurations"]["shuffle_after_one_HiL_run"]
        )
        # Call the human-in-the-loop method
        real_hil.human_in_the_loop()
        ########################################################################################
        # OPTIONAL:
        # As described in the paper, after one HiL run with the cluster-specific strategy, no cells are left to be
        # annotated. Therefore, we store the number of newly labeled samples for each loop in this list.
        # The newly labeled samples are then shuffled in the remaining repeated HiL runs, while maintaining the
        # same number of new samples per loop.
        # If you need this option, set the parameter shuffle_after_first_run in RealHiLLogic to True.
        if experiment_name == "cluster_specific" and current_run == 1:
            relabeled_cells_idx_lst = real_hil.relabeled_cells_idx_lst
            np.save(os.path.join(run_path, "relabeled_cells_idx_lst_run_1.npy"), relabeled_cells_idx_lst)
            new_labels = real_hil.new_labels
            np.save(os.path.join(run_path, "new_labels.npy"), new_labels)
            num_of_relabeled_cells = real_hil.num_of_relabeled_cells
        ########################################################################################
        # Store the results of the current run in the result dataframes
        df_general_metrics = real_hil.df_general_metrics
        df_general_results = pd.concat([df_general_results, df_general_metrics], ignore_index=True)
        df_cluster_metrics = real_hil.df_cluster_metrics
        df_cluster_results = pd.concat([df_cluster_results, df_cluster_metrics], ignore_index=True)
    # Save the general and cluster results to Excel files
    df_cluster_results_for_plotting = save_results(
        n_runs=n_runs,
        save_path=save_path,
        experiment_name=experiment_name,
        df_general_results=df_general_results,
        df_cluster_results=df_cluster_results
    )

    # Create the line plot
    plot_results(
        experiment_name=experiment_name,
        df_cluster_results_for_plotting=df_cluster_results_for_plotting,
        save_path=save_path,
    )


if __name__ == '__main__':
    main()