In [79]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
from datetime import datetime, date, time
import scipy as scipy

%matplotlib inline
sns.set_palette("deep", desat=.6)
sns.set_context(rc={"figure.figsize": (8, 8)})
In [80]:
data = pd.read_csv('train.csv')
In [81]:
data.head()
Out[81]:
datetime season holiday workingday weather temp atemp humidity windspeed casual registered count
0 2011-01-01 00:00:00 1 0 0 1 9.84 14.395 81 0 3 13 16
1 2011-01-01 01:00:00 1 0 0 1 9.02 13.635 80 0 8 32 40
2 2011-01-01 02:00:00 1 0 0 1 9.02 13.635 80 0 5 27 32
3 2011-01-01 03:00:00 1 0 0 1 9.84 14.395 75 0 3 10 13
4 2011-01-01 04:00:00 1 0 0 1 9.84 14.395 75 0 0 1 1

5 rows × 12 columns

In [82]:
fmt = "%Y-%m-%d %H:%M:%S"
def getHour(row):
    thisday = datetime.strptime(row['datetime'], fmt)
    return thisday.hour

def getMonth(row):
    thisday = datetime.strptime(row['datetime'], fmt)
    return thisday.month

def getDay(row):
    thisday = datetime.strptime(row['datetime'], fmt)
    return thisday.weekday()

def getYear(row):
    thisday = datetime.strptime(row['datetime'], fmt)
    return thisday.year

data['hour'] = data.apply(getHour, axis=1)
data['month'] = data.apply(getMonth, axis=1)
data['day'] = data.apply(getDay, axis=1)
data['year'] = data.apply(getYear, axis=1)
workingday = data['workingday'].values
weather = data['weather'].values
temp = data['temp'].values
atemp = data['atemp'].values
hum = data['humidity'].values
season = data['season'].values
holiday = data['holiday'].values
workingday = data['workingday'].values
windspeed = data['windspeed'].values
count = data['count'].values
datetimes = data['datetime'].values
hours = data['hour'].values
casual = data['casual'].values
registered = data['registered'].values
month = data['month'].values
day = data['day'].values
year = data['year'].values

data.head()
Out[82]:
datetime season holiday workingday weather temp atemp humidity windspeed casual registered count hour month day year
0 2011-01-01 00:00:00 1 0 0 1 9.84 14.395 81 0 3 13 16 0 1 5 2011
1 2011-01-01 01:00:00 1 0 0 1 9.02 13.635 80 0 8 32 40 1 1 5 2011
2 2011-01-01 02:00:00 1 0 0 1 9.02 13.635 80 0 5 27 32 2 1 5 2011
3 2011-01-01 03:00:00 1 0 0 1 9.84 14.395 75 0 3 10 13 3 1 5 2011
4 2011-01-01 04:00:00 1 0 0 1 9.84 14.395 75 0 0 1 1 4 1 5 2011

5 rows × 16 columns

In [83]:
from sklearn.cross_validation import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn import gaussian_process
from sklearn.ensemble.partial_dependence import plot_partial_dependence

X = np.column_stack((holiday, temp, atemp, hum, season, weather, workingday, windspeed, hours, month, day, year))
Y = np.column_stack((casual, count))
Z = np.column_stack((registered, count))
regressor = DecisionTreeRegressor(random_state=0, max_features=1.0, min_samples_split=2)
regressor2 = DecisionTreeRegressor(random_state=0)
scores = cross_val_score(regressor, X, Y, cv=10)
scores2 = cross_val_score(regressor2, X, Z, cv=10)
regressor.fit(X,Y)
regressor2.fit(X,Z)
y0 = regressor.predict(X)
y02 = regressor2.predict(X)
plt.scatter(Y,y0)
Out[83]:
<matplotlib.collections.PathCollection at 0x7f52b047c610>
In [84]:
plt.scatter(Z,y02)
Out[84]:
<matplotlib.collections.PathCollection at 0x7f52b02aaa90>
In [92]:
slope, intercept, r_value2, p_value, std_err = scipy.stats.linregress(Z[:, 0:1], y02[:, 0:1])
r_value2
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-92-59416c9f6bb5> in <module>()
----> 1 slope, intercept, r_value2, p_value, std_err = scipy.stats.linregress(Z[:, 0:1], y02[:, 0:1])
      2 r_value2

/home/nick/anaconda/lib/python2.7/site-packages/scipy/stats/stats.pyc in linregress(x, y)
   3009 
   3010     # average sum of squares:
-> 3011     ssxm, ssxym, ssyxm, ssym = np.cov(x, y, bias=1).flat
   3012     r_num = ssxym
   3013     r_den = np.sqrt(ssxm*ssym)

ValueError: too many values to unpack
In [71]:
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(Y, y0)
r_value
Out[71]:
0.99998884772495089
In [24]:
data2 = pd.read_csv('test.csv')

data2['hour'] = data2.apply(getHour, axis=1)
data2['month'] = data2.apply(getMonth, axis=1)
data2['day'] = data2.apply(getDay, axis=1)
data2['year'] = data2.apply(getYear, axis=1)
workingdayT = data2['workingday'].values
weatherT = data2['weather'].values
tempT = data2['temp'].values
atempT = data2['atemp'].values
humT = data2['humidity'].values
seasonT = data2['season'].values
holidayT = data2['holiday'].values
workingdayT = data2['workingday'].values
windspeedT = data2['windspeed'].values
datetimesT = data2['datetime'].values
hoursT = data2['hour'].values
monthT = data2['month'].values
dayT = data2['day'].values
yearT = data2['year'].values

X1 = np.column_stack((holidayT, tempT, atempT, humT, seasonT, weatherT, workingdayT, windspeedT, hoursT, monthT, dayT, yearT))

data2.head()
Out[24]:
datetime season holiday workingday weather temp atemp humidity windspeed hour month day year
0 2011-01-20 00:00:00 1 0 1 1 10.66 11.365 56 26.0027 0 1 3 2011
1 2011-01-20 01:00:00 1 0 1 1 10.66 13.635 56 0.0000 1 1 3 2011
2 2011-01-20 02:00:00 1 0 1 1 10.66 13.635 56 0.0000 2 1 3 2011
3 2011-01-20 03:00:00 1 0 1 1 10.66 12.880 56 11.0014 3 1 3 2011
4 2011-01-20 04:00:00 1 0 1 1 10.66 12.880 56 11.0014 4 1 3 2011

5 rows × 13 columns

In [72]:
y1 = regressor.predict(X1)
y2 = regressor2.predict(X1)
sub = pd.DataFrame.from_records(np.column_stack((X1, y1, y2)))
In [76]:
def prepare(row):
    tot = row[12] + row[13]
    #if tot < 0:
    #    return 0
    #else:
    return tot
    
sub['total'] = sub.apply(prepare, axis=1)
In [77]:
sub.head()
Out[77]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 total
0 0 10.66 11.365 56 1 1 1 26.0027 0 1 3 2011 3 12 15
1 0 10.66 13.635 56 1 1 1 0.0000 1 1 3 2011 0 4 4
2 0 10.66 13.635 56 1 1 1 0.0000 2 1 3 2011 1 1 2
3 0 10.66 12.880 56 1 1 1 11.0014 3 1 3 2011 0 4 4
4 0 10.66 12.880 56 1 1 1 11.0014 4 1 3 2011 0 4 4

5 rows × 15 columns

In [78]:
sub.to_csv("sub4.csv")
In [11]:
from sklearn.externals.six import StringIO  
import pydot
from sklearn import tree
In [12]:
dot_data = StringIO()
tree.export_graphviz(regressor, out_file=dot_data) 
graph = pydot.graph_from_dot_data(dot_data.getvalue()) 
graph.write_pdf("tree.pdf")
graph.
Out[12]:
True
In []:


Comments

comments powered by Disqus