forked from alfifutuhi13/miniproject1
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRepo1 Sales Analysis.py
258 lines (142 loc) · 5.83 KB
/
Repo1 Sales Analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
#!/usr/bin/env python
# coding: utf-8
# ### Import Necessary Libraries
# In[1]:
import pandas as pd
import os
# #### Task 1: Merging 12 months of sales data into a single csv file.
# In[2]:
df = pd.read_csv("D:\Self\Online Course\Solve real world Data science task\Pandas-Data-Science-Tasks-master\Pandas-Data-Science-Tasks-master\SalesAnalysis\Sales_Data\Sales_April_2019.csv")
files = [file for file in os.listdir("D:\Self\Online Course\Solve real world Data science task\Pandas-Data-Science-Tasks-master\Pandas-Data-Science-Tasks-master\SalesAnalysis\Sales_Data")]
all_months_data = pd.DataFrame() #Creating empty dataframe called 'all_month_data'
for file in files:
df = pd.read_csv("D:\Self\Online Course\Solve real world Data science task\Pandas-Data-Science-Tasks-master\Pandas-Data-Science-Tasks-master\SalesAnalysis\Sales_Data/"+file)
all_months_data = pd.concat([all_months_data, df]) #Merging to the previous empty dataframe
#Checking the result
all_months_data.to_csv("all_data.csv", index=False) #single csv file contain 12 months data.
# #### Reading an updated dataframe
# In[3]:
all_data=pd.read_csv("all_data.csv")
all_data.head()
# #### Task 2: Add "Month" and "Sales" Column
# In[4]:
#Removing Nan Values in our data
all_data=all_data.dropna(how='all')
#Removing rows based on condition, finding 'Or' and delete it
all_data = all_data[all_data['Order Date'].str[0:2]!='Or']
#Add "Month" Column
all_data['Month'] = all_data['Order Date'].str[0:2] #Get the first 2 characters.
all_data['Month'] = all_data['Month'].astype('int32') #turning the data from string to integer
#Convert 'Quantity Ordered' and 'Price Each' to numeric
all_data['Quantity Ordered'] = pd.to_numeric(all_data['Quantity Ordered']) #Becoming integer
all_data['Price Each'] = pd.to_numeric(all_data['Price Each']) #Becoming float
#Add "Sales" Column
all_data['Sales'] = all_data['Quantity Ordered']*all_data['Price Each']
all_data.head()
# ### Question 1: What was the best month for sales? How much was earned that month?
# In[5]:
#all_data.groupby('Month').sum()
# #### Importing Matplotlib
# In[6]:
import matplotlib.pyplot as plt
# #### Visualizing our results
# In[7]:
months = range(1,13) #For x axes
results = all_data.groupby('Month').sum()
plt.bar(months, results['Sales'])
plt.xticks(months)
labels, location = plt.yticks()
plt.yticks(labels, (labels/1000000).astype(int)) #Scaling in million USD
plt.ylabel('Sales in million USD')
plt.xlabel('Month Number')
plt.show()
# ### Question 2: What city sold the most product?
# #### Task 3: Add a "City" Column
# In[8]:
#Function
def get_city(address):
return address.split(',')[1]
def get_state(address):
return address.split(',')[2].split(' ')[1]
#Extract the city and the state
all_data['City'] = all_data['Purchase Address'].apply(lambda x: get_city(x) + ' ' + get_state(x))
all_data.head()
# In[9]:
results2 = all_data.groupby('City').sum()
results2
# In[10]:
#We've already import the matplotlib
#Fixing the cities order
#cities = all_data['City'].unique()
cities = [city for city, df in all_data.groupby('City')]
plt.bar(cities, results2['Sales'])
plt.xticks(cities, rotation='vertical', size = 8)
labels, location = plt.yticks()
plt.yticks(labels, (labels/1000000).astype(int)) #Scaling in million USD
plt.ylabel('Sales in million USD')
plt.xlabel('City Name')
plt.show()
# ### Question 3: What time should we display advertisements to maximize likelihood of customer's buying product?
# #### Task 4: Aggregate the period in 24-hours distribution
# In[11]:
#Create new column in date-time Object (DTO)
all_data['Order_Date_DTO'] = pd.to_datetime(all_data['Order Date'])
#Extraction the hours data
all_data['Hour'] = all_data['Order_Date_DTO'].dt.hour
all_data.head()
# In[12]:
#Plotting
results3 = all_data.groupby(['Hour'])['Quantity Ordered'].count()
hours = [hour for hour, df in all_data.groupby('Hour')]
plt.plot(hours, results3)
plt.xticks(hours)
plt.xlabel('Hour')
plt.ylabel('Number of Orders')
plt.grid()
plt.show()
# ### Question 4: What products are most often sold together?
# #### Task 5: Make a new column called "Product Bundle"
# In[18]:
#Make a new dataframe to seperate the duplicated values of Order ID
new_all = all_data[all_data['Order ID'].duplicated(keep=False)]
#Joining few products with the same Order ID into the same line.
new_all['Product_Bundle'] = new_all.groupby('Order ID')['Product'].transform(lambda x: ','.join(x))
#Dropping the duplicate values
new_all = new_all[['Order ID','Product_Bundle']].drop_duplicates()
new_all.head()
# #### Task 6: Counting the Product bundles
# In[24]:
#Importing libraries
from itertools import combinations
from collections import Counter
count = Counter()
for row in new_all['Product_Bundle']:
row_list = row.split(',')
#count.update(Counter(combinations(row_list,2))) #Counting all the 2 products bundle
count.update(Counter(combinations(row_list,3))) #Counting all the 3 products bundle
count.most_common(10)
# ### Question 5: What Product sold the most? Why do you think it did?
# #### Task 7: Grouping by the product
# In[28]:
product_group = all_data.groupby('Product')
#Visualizing
quantity_ordered = product_group.sum()['Quantity Ordered']
products = [product for product, df in product_group]
plt.bar(products, quantity_ordered)
plt.ylabel('Quantity Ordered')
plt.xlabel('Product')
plt.xticks(products, rotation='vertical', size=8)
plt.show()
# #### Task 8: Overlaying a second y-axis on existing chart
# In[33]:
prices = all_data.groupby('Product').mean()['Price Each']
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.bar(products, quantity_ordered, color='g')
ax2.plot(products, prices, 'b-')
ax1.set_xlabel('Product Name')
ax1.set_ylabel('Quantity Ordered', color='g')
ax2.set_ylabel('Price ($)', color = 'b')
ax1.set_xticklabels(products, rotation='vertical', size=8)
plt.show()
# In[ ]: