# 数据科学与机器学习管道中预处理的重要性（二）：中心化、缩放和逻辑回归

## Python回归简介

### 线性回归Python实现

``# Import necessary packages import pandas as pd %matplotlib inline import matplotlib.pyplot as plt plt.style.use('ggplot') from sklearn import datasets from sklearn import linear_model import numpy as np # Load data boston = datasets.load_boston() yb = boston.target.reshape(-1, 1) Xb = boston['data'][:,5].reshape(-1, 1) # Plot data plt.scatter(Xb,yb) plt.ylabel('value of house /1000 (\$)') plt.xlabel('number of rooms') # Create linear regression object regr = linear_model.LinearRegression() # Train the model using the training sets regr.fit( Xb, yb) # Plot outputs plt.scatter(Xb, yb,  color='black') plt.plot(Xb, regr.predict(Xb), color='blue',          linewidth=3) plt.show()``

### 逻辑回归Python实现

``# Synthesize data X1 = np.random.normal(size=150) y1 = (X1 > 0).astype(np.float) X1[X1 > 0] *= 4 X1 += .3 * np.random.normal(size=150) X1= X1.reshape(-1, 1) # Run the classifier clf = linear_model.LogisticRegression() clf.fit(X1, y1) # Plot the result plt.scatter(X1.ravel(), y1, color='black', zorder=20 , alpha = 0.5) plt.plot(X1_ordered, clf.predict_proba(X1_ordered)[:,1], color='blue' , linewidth = 3) plt.ylabel('target variable') plt.xlabel('predictor variable') plt.show()``

## 逻辑回归和数据缩放：红酒数据集

``# Import necessary modules from sklearn import linear_model from sklearn.cross_validation import train_test_split # Load data df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv ' , sep = ';') X = df.drop('quality' , 1).values #drop target variable y1 = df['quality'].values y = y1 <= 5 # is the rating <= 5? # plot histograms of original target variable # and aggregated target variable plt.figure(figsize=(20,5)); plt.subplot(1, 2, 1 ); plt.hist(y1); plt.xlabel('original target value') plt.ylabel('count') plt.subplot(1, 2, 2); plt.hist(y) plt.xlabel('aggregated target value') plt.show()``

``# Split the data into test and training sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) #initial logistic regression model lr = linear_model.LogisticRegression() # fit the model lr = lr.fit(X_train, y_train) print('Logistic Regression score for training set: %f' % lr.score(X_train, y_train)) from sklearn.metrics import classification_report y_true, y_pred = y_test, lr.predict(X_test) print(classification_report(y_true, y_pred))``
```
Logistic Regression score for training set: 0.752932
             precision    recall  f1-score   support

      False       0.78      0.74      0.76       179
       True       0.69      0.74      0.71       141

avg / total       0.74      0.74      0.74       320
```

``from sklearn.preprocessing import scale Xs = scale(X) Xs_train, Xs_test, y_train, y_test = train_test_split(Xs, y, test_size=0.2, random_state=42) lr_2 = lr.fit(Xs_train, y_train) print('Scaled Logistic Regression score for test set: %f' % lr_2.score(Xs_test, y_test)) y_true, y_pred = y_test, lr_2.predict(Xs_test) print(classification_report(y_true, y_pred))``
```
Scaled Logistic Regression score for test set: 0.740625
             precision    recall  f1-score   support

      False       0.79      0.74      0.76       179
       True       0.69      0.74      0.72       141

avg / total       0.74      0.74      0.74       320
```

``# Set sc = True if you want to scale your features sc = False  # Load data df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv ' , sep = ';') X = df.drop('quality' , 1).values # drop target variable  # Here we scale, if desired if sc == True: X = scale(X)  # Target value y1 = df['quality'].values # original target variable y = y1 <= 5 # new target variable: is the rating <= 5?  # Split the data into a test set and a training set X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # Train logistic regression model and print performance on the test set lr = linear_model.LogisticRegression() lr = lr.fit(X_train, y_train) print('Logistic Regression score for training set: %f' % lr.score(X_train, y_train)) y_true, y_pred = y_test, lr.predict(X_test) print(classification_report(y_true, y_pred))``
```
<script.py> output:
    Logistic Regression score for training set: 0.752932
                 precision    recall  f1-score   support

          False       0.78      0.74      0.76       179
           True       0.69      0.74      0.71       141

    avg / total       0.74      0.74      0.74       320
```

### 术语表

K近邻（k-Nearest Neighbors）：分类任务的一种算法，一个数据点的标签由离它最近的k个数据点（邻居）投票决定。