Fix abdulfatir#32: circular-import ImportError when importing XGBClassifier
Resolved the issue where importing XGBClassifier from xgboost raised an ImportError due to a circular import: the script itself was named xgboost.py, so Python resolved the import to the script instead of the installed xgboost library.
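For illustration, a minimal reproduction of the conflict; the error text varies slightly across Python versions, and the rename target below is only an example:

# xgboost.py  (the script's name shadows the installed xgboost package)
from xgboost import XGBClassifier
# Running this file imports the file itself as the module 'xgboost',
# re-enters this import, and fails with roughly:
#   ImportError: cannot import name 'XGBClassifier' from partially
#   initialized module 'xgboost' (most likely due to a circular import)
# Renaming the script (e.g. to xgb_classifier.py) removes the shadowing.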

Solution:
Renamed the script so it no longer shadows the external xgboost package.
Updated the code to Python 3 standards, replacing deprecated functions and outdated syntax.
Changes:
Renamed the script, removing the ImportError caused by the naming conflict with xgboost.
Replaced xrange() with range() for Python 3 compatibility.
Converted Python 2 print statements to print() function calls.
Changed the XGBClassifier initialization from the removed silent parameter to verbosity, in line with current XGBoost releases (a short sketch follows this list).
Simplified the TF-IDF helper to fit and transform in a single call and streamlined its call sites (see the note below).
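As a reference for the silent-to-verbosity change, a minimal sketch of the updated initialization; the parameter values mirror the ones used in this script:

from xgboost import XGBClassifier

# 'verbosity' replaces the removed boolean 'silent' flag in current XGBoost
# releases; levels: 0 = silent, 1 = warnings, 2 = info, 3 = debug.
clf = XGBClassifier(max_depth=25, n_estimators=400, verbosity=1)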
This resolves the issue and ensures the code runs smoothly across Python 3.x environments.
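A note on the simplified apply_tf_idf() helper: it now fits a fresh TfidfTransformer on whatever matrix it is given, so validation and test batches no longer reuse the IDF weights learned from the training batch. The usual scikit-learn pattern fits once on training counts and reuses the fitted transformer; a minimal sketch, with train_counts and val_counts as hypothetical count matrices:

from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer(smooth_idf=True, sublinear_tf=True, use_idf=True)
train_tfidf = transformer.fit_transform(train_counts)  # learn IDF from training data only
val_tfidf = transformer.transform(val_counts)  # apply the same IDF weights to validation data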
Aavishkar04 authored Oct 13, 2024
1 parent 77e1888 commit 95d47b8
Showing 1 changed file with 44 additions and 51 deletions.
code/xgboost.py
@@ -1,13 +1,12 @@
 import utils
 import random
 import numpy as np
 from xgboost import XGBClassifier
 from scipy.sparse import lil_matrix
 from sklearn.feature_extraction.text import TfidfTransformer
 
 # Performs classification using XGBoost.
-
-
+# Configuration Variables
 FREQ_DIST_FILE = '../train-processed-freqdist.pkl'
 BI_FREQ_DIST_FILE = '../train-processed-freqdist-bi.pkl'
 TRAIN_PROCESSED_FILE = '../train-processed.csv'
@@ -18,42 +17,39 @@
 USE_BIGRAMS = True
 if USE_BIGRAMS:
     BIGRAM_SIZE = 10000
-    VOCAB_SIZE = UNIGRAM_SIZE + BIGRAM_SIZE
+    VOCAB_SIZE += BIGRAM_SIZE  # Combined Unigram and Bigram Size
 FEAT_TYPE = 'frequency'
 
 
+# Helper function to extract unigrams and bigrams
 def get_feature_vector(tweet):
     uni_feature_vector = []
     bi_feature_vector = []
     words = tweet.split()
-    for i in xrange(len(words) - 1):
+    for i in range(len(words) - 1):  # xrange replaced with range
         word = words[i]
         next_word = words[i + 1]
         if unigrams.get(word):
             uni_feature_vector.append(word)
-        if USE_BIGRAMS:
-            if bigrams.get((word, next_word)):
-                bi_feature_vector.append((word, next_word))
-    if len(words) >= 1:
-        if unigrams.get(words[-1]):
-            uni_feature_vector.append(words[-1])
+        if USE_BIGRAMS and bigrams.get((word, next_word)):
+            bi_feature_vector.append((word, next_word))
+    if len(words) >= 1 and unigrams.get(words[-1]):
+        uni_feature_vector.append(words[-1])
     return uni_feature_vector, bi_feature_vector
 
 
+# Function to extract features from the tweets dataset
 def extract_features(tweets, batch_size=500, test_file=True, feat_type='presence'):
     num_batches = int(np.ceil(len(tweets) / float(batch_size)))
-    for i in xrange(num_batches):
+    for i in range(num_batches):  # xrange replaced with range
         batch = tweets[i * batch_size: (i + 1) * batch_size]
         features = lil_matrix((batch_size, VOCAB_SIZE))
         labels = np.zeros(batch_size)
         for j, tweet in enumerate(batch):
             if test_file:
-                tweet_words = tweet[1][0]
-                tweet_bigrams = tweet[1][1]
+                tweet_words, tweet_bigrams = tweet[1]
             else:
-                tweet_words = tweet[2][0]
-                tweet_bigrams = tweet[2][1]
+                tweet_words, tweet_bigrams = tweet[2]
                 labels[j] = tweet[1]
 
             if feat_type == 'presence':
                 tweet_words = set(tweet_words)
                 tweet_bigrams = set(tweet_bigrams)
@@ -68,26 +64,15 @@ def extract_features(tweets, batch_size=500, test_file=True, feat_type='presence'):
                         features[j, UNIGRAM_SIZE + idx] += 1
         yield features, labels
 
 
+# Apply TF-IDF transformation to features
 def apply_tf_idf(X):
     transformer = TfidfTransformer(smooth_idf=True, sublinear_tf=True, use_idf=True)
-    transformer.fit(X)
-    return transformer
+    return transformer.fit_transform(X)  # Direct transformation after fitting
 
 
+# Process and return tweets with features
 def process_tweets(csv_file, test_file=True):
-    """Returns a list of tuples of type (tweet_id, feature_vector)
-    or (tweet_id, sentiment, feature_vector)
-    Args:
-        csv_file (str): Name of processed csv file generated by preprocess.py
-        test_file (bool, optional): If processing test file
-    Returns:
-        list: Of tuples
-    """
     tweets = []
-    print 'Generating feature vectors'
+    print('Generating feature vectors')  # Print format changed
     with open(csv_file, 'r') as csv:
         lines = csv.readlines()
         total = len(lines)
@@ -102,64 +87,72 @@ def process_tweets(csv_file, test_file=True):
             else:
                 tweets.append((tweet_id, int(sentiment), feature_vector))
             utils.write_status(i + 1, total)
-    print '\n'
+    print('\nProcessing complete')
     return tweets
 
 
 if __name__ == '__main__':
     np.random.seed(1337)
     unigrams = utils.top_n_words(FREQ_DIST_FILE, UNIGRAM_SIZE)
     if USE_BIGRAMS:
         bigrams = utils.top_n_bigrams(BI_FREQ_DIST_FILE, BIGRAM_SIZE)
 
     tweets = process_tweets(TRAIN_PROCESSED_FILE, test_file=False)
 
     if TRAIN:
         train_tweets, val_tweets = utils.split_data(tweets)
     else:
         random.shuffle(tweets)
         train_tweets = tweets
-    del tweets
-    print 'Extracting features & training batches'
-    clf = XGBClassifier(max_depth=25, silent=False, n_estimators=400)
+
+    del tweets  # Free up memory
+
+    print('Extracting features & training batches')
+    clf = XGBClassifier(max_depth=25, verbosity=1, n_estimators=400)  # Changed 'silent' to 'verbosity'
 
     batch_size = len(train_tweets)
     i = 1
     n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))
 
     for training_set_X, training_set_y in extract_features(train_tweets, test_file=False, feat_type=FEAT_TYPE, batch_size=batch_size):
         utils.write_status(i, n_train_batches)
         i += 1
         if FEAT_TYPE == 'frequency':
-            tfidf = apply_tf_idf(training_set_X)
-            training_set_X = tfidf.transform(training_set_X)
+            training_set_X = apply_tf_idf(training_set_X)
         clf.fit(training_set_X, training_set_y)
-    print '\n'
-    print 'Testing'
+
+    print('\nTesting model performance')
     if TRAIN:
         correct, total = 0, len(val_tweets)
         i = 1
         batch_size = len(val_tweets)
         n_val_batches = int(np.ceil(len(val_tweets) / float(batch_size)))
 
         for val_set_X, val_set_y in extract_features(val_tweets, test_file=False, feat_type=FEAT_TYPE, batch_size=batch_size):
             if FEAT_TYPE == 'frequency':
-                val_set_X = tfidf.transform(val_set_X)
+                val_set_X = apply_tf_idf(val_set_X)
             prediction = clf.predict(val_set_X)
             correct += np.sum(prediction == val_set_y)
             utils.write_status(i, n_val_batches)
             i += 1
-        print '\nCorrect: %d/%d = %.4f %%' % (correct, total, correct * 100. / total)
+
+        accuracy = correct * 100. / total
+        print(f'\nCorrect: {correct}/{total} = {accuracy:.4f}%')
     else:
         del train_tweets
         test_tweets = process_tweets(TEST_PROCESSED_FILE, test_file=True)
-        n_test_batches = int(np.ceil(len(test_tweets) / float(batch_size)))
         predictions = np.array([])
-        print 'Predicting batches'
+        print('Predicting test set batches')
         i = 1
+        n_test_batches = int(np.ceil(len(test_tweets) / float(batch_size)))
 
         for test_set_X, _ in extract_features(test_tweets, test_file=True, feat_type=FEAT_TYPE):
             if FEAT_TYPE == 'frequency':
-                test_set_X = tfidf.transform(test_set_X)
+                test_set_X = apply_tf_idf(test_set_X)
             prediction = clf.predict(test_set_X)
             predictions = np.concatenate((predictions, prediction))
             utils.write_status(i, n_test_batches)
             i += 1
-        predictions = [(str(j), int(predictions[j]))
-                       for j in range(len(test_tweets))]
-        utils.save_results_to_csv(predictions, 'xgboost.csv')
-        print '\nSaved to xgboost.csv'
+
+        predictions = [(str(j), int(predictions[j])) for j in range(len(test_tweets))]
+        utils.save_results_to_csv(predictions, 'xgboost_predictions.csv')
+        print('\nResults saved to xgboost_predictions.csv')
