forked from tazeek/BullyDetect
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbalanced_set.py
33 lines (24 loc) · 875 Bytes
/
balanced_set.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import random
import pandas as pd
import numpy as np
def build_balanced_set(df, label_col='Insult', extra_negatives=10,
                       random_state=None):
    """Return a shuffled, roughly class-balanced copy of *df*.

    Every row whose ``label_col`` equals 1 is kept, and rows whose
    ``label_col`` equals 0 are undersampled without replacement to
    ``(number of positive rows) + extra_negatives``.

    Args:
        df: DataFrame containing a binary ``label_col`` column (1 / 0).
        label_col: Name of the label column.
        extra_negatives: How many more negative rows than positive rows to
            keep.
        random_state: Optional seed (or ``numpy.random.RandomState``) passed
            to ``DataFrame.sample`` so the result can be reproduced.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index, rows in random order.
    """
    # Split into the positive (label 1) and negative (label 0) classes.
    bully_df = df[df[label_col] == 1]
    normal_df = df[df[label_col] == 0]

    # Clamp the request to what is available: asking for more rows than the
    # negative class holds would raise ValueError when sampling without
    # replacement.
    wanted = len(bully_df) + extra_negatives
    n_negatives = max(0, min(wanted, len(normal_df)))

    # Sample without replacement.
    sampled_normal_df = normal_df.sample(n=n_negatives,
                                         random_state=random_state)

    # ignore_index gives the combined frame a clean index, so the filtered
    # slices never need an in-place reset_index.
    balanced_df = pd.concat([bully_df, sampled_normal_df], ignore_index=True)

    # Shuffle so the classes are interleaved rather than stacked.
    shuffled = balanced_df.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)


def main():
    """Read 'clean_dataset.csv', balance it and write 'balanced_dataset.csv'."""
    # Load in the full dataframe
    df = pd.read_csv('clean_dataset.csv')

    balanced_df = build_balanced_set(df)

    # Save the file
    balanced_df.to_csv('balanced_dataset.csv', index=False)


if __name__ == '__main__':
    main()