-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdata_analysis.py
116 lines (86 loc) · 2.75 KB
/
data_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 15 00:00:24 2017
@author: Lily
"""
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
# Raw book data set, loaded once at import time; DataCleaning() reads it.
df = pd.read_csv('book_data_2.csv')
# Clean the data
# Module-level placeholders: DataCleaning() assigns the cleaned frame to
# df1, and DescribeData() assigns the summary table to describe.
df1 = None
describe = None
def DataCleaning():
    """Build the cleaned data set from the module-level ``df``.

    Rows containing any missing value are removed first.  Rows whose
    ``WEB_PRICE`` is 0 are then printed and removed as well.  The result is
    stored in the module-level ``df1`` and also returned.

    Returns:
        The cleaned DataFrame (the same object as the global ``df1``).
    """
    global df1
    df1 = df.dropna(axis=0, how='any')

    # A WEB_PRICE of 0 looks like a placeholder used instead of NaN, so it
    # carries no information.  The rows are located by value rather than by
    # fixed row labels, so the cleaning keeps working (and cannot raise
    # KeyError) when the CSV gains, loses or reorders rows.
    zero_price_rows = df1[df1['WEB_PRICE'].isin([0.0])]
    print(zero_price_rows)
    df1 = df1.drop(zero_price_rows.index)
    return df1
#describe the data
def DescribeData():
    """Summarise the analysed columns and save the table to CSV.

    ``Series.describe()`` is computed for the PRICE, PL, WEB_PRICE and SCORE
    columns of the module-level ``df1``.  The four summaries are combined
    into one DataFrame (one column per variable), stored in the module-level
    ``describe`` and written to ``books_describe.csv`` in the current
    working directory.
    """
    global describe
    # Each column is summarised exactly once.
    columns = ('PRICE', 'PL', 'WEB_PRICE', 'SCORE')
    describe = pd.DataFrame({name: df1[name].describe() for name in columns})
    describe.to_csv('books_describe.csv')
#box plots
def BoxPlots():
    """Draw one box plot per analysed column of the module-level ``df1``.

    A new matplotlib figure is opened before each plot, in the order
    SCORE, PL, WEB_PRICE, PRICE.
    """
    # (column name, value handed to the ``legend`` argument), in draw order.
    # NOTE(review): ``legend`` receives a string here; confirm whether a
    # title or axis label was the real intent.
    panels = (
        ('SCORE', 'Score'),
        ('PL', 'PL'),
        ('WEB_PRICE', 'Web_price'),
        ('PRICE', 'Price'),
    )
    for column, label in panels:
        plt.figure()
        df1[column].plot.box(legend=label)
#Histograms
def Histograms():
    """Draw two histograms for each analysed column of the global ``df1``.

    For SCORE, PRICE, WEB_PRICE and PL (in that order) this opens a figure
    with the histogram of the column itself, labelled through a legend, and
    then a second figure with the histogram of the row-to-row differences
    (``Series.diff()``) of that column.
    """
    # Options shared by every histogram call.
    hist_options = dict(alpha=0.8, bins=50, figsize=(6, 4))
    # (column name, legend text), in draw order.
    series_to_plot = (
        ('SCORE', 'Score'),
        ('PRICE', 'Price'),
        ('WEB_PRICE', 'Web_price'),
        ('PL', 'Pl'),
    )
    for column, label in series_to_plot:
        # Histogram of the raw values, with a legend entry.
        plt.figure()
        axes = df1[column].hist(**hist_options)
        axes.legend([label])
        # Histogram of the differences between consecutive rows.
        plt.figure()
        df1[column].diff().hist(**hist_options)
#regression analysis
def RegressionAnalysis():
    """Print correlations, draw regression plots and print OLS summaries.

    Works on the module-level ``df1``:

    1. prints the correlation matrix returned by ``df1.corr()``;
    2. draws seaborn ``lmplot`` figures for PRICE vs SCORE, WEB_PRICE vs
       SCORE and SCORE vs PL;
    3. fits and prints three single-predictor OLS models: SCORE on PL,
       PRICE on SCORE and WEB_PRICE on SCORE.
    """
    print(df1.corr())

    # x, y and data are passed by keyword: recent seaborn releases no longer
    # accept them positionally, while older releases accept both forms.
    sns.lmplot(x='SCORE', y='PRICE', data=df1)
    sns.lmplot(x='SCORE', y='WEB_PRICE', data=df1)
    sns.lmplot(x='PL', y='SCORE', data=df1)

    # (response column, predictor column, heading printed before the summary)
    models = (
        ('SCORE', 'PL', "The result of Score_Pl regression analysis is:"),
        ('PRICE', 'SCORE', "The result of Price-Score regression analysis is:"),
        ('WEB_PRICE', 'SCORE',
         "The result of Web_price-Score regression analysis is:"),
    )
    for response, predictor, heading in models:
        # sm.OLS does not add an intercept by itself.  Without add_constant
        # the fitted line is forced through the origin, which disagrees with
        # the lmplot fits above and inflates the reported R-squared.
        design = sm.add_constant(df1[predictor])
        result = sm.OLS(df1[response], design).fit()
        print(heading)
        print(result.summary())
# Run the whole analysis, one step after another.
for _step in (DataCleaning, DescribeData, BoxPlots, Histograms,
              RegressionAnalysis):
    _step()