# 机器学习简易入门（一） – 线性回归

```python
import pandas

# Read sp500.csv (expected in the working directory) into a DataFrame.
sp500 = pandas.read_csv("sp500.csv")
```

```python
# Keep only the rows whose 'value' entry is not the literal string '.'.
is_real_value = sp500['value'] != '.'
sp500 = sp500[is_real_value]
```

```python
# Every value except the first one: shifted up by one row, this is the
# "next day" value for each remaining row.
next_day = sp500["value"].iloc[1:]

# Drop the last row (it has no following value). Take an explicit copy so
# the column assignment below writes to an independent frame instead of a
# slice of the previous one (avoids pandas' SettingWithCopyWarning).
sp500 = sp500.iloc[:-1, :].copy()

# .values discards the index so the column is aligned by position.
sp500["next_day"] = next_day.values
```

```python
# Column dtypes before conversion
print(sp500.dtypes)
```

```python
# Convert both columns to float so they can be used for regression.
sp500['value'] = sp500['value'].astype(float)
sp500['next_day'] = sp500['next_day'].astype(float)

# Column dtypes after conversion
print(sp500.dtypes)
```

```python
# Import the estimator class
from sklearn.linear_model import LinearRegression

# Create the model
regressor = LinearRegression()

# The predictors must be a DataFrame (2-D), not a Series
predictors = sp500[["value"]]   # double brackets -> DataFrame
to_predict = sp500["next_day"]  # single brackets -> Series

# Train the linear regression model
regressor.fit(predictors, to_predict)

# Generate predictions from the fitted model
next_day_predictions = regressor.predict(predictors)
print(next_day_predictions)
```

```python
# Mean squared error: the squared differences between actual and predicted
# values, summed and divided by the number of predictions.
squared_errors = (to_predict - next_day_predictions) ** 2
mse = sum(squared_errors) / len(next_day_predictions)
```

```python
import math

# `to_predict` holds the actual next-day values and `next_day_predictions`
# the model output; the names `predictions` / `test` used here previously
# are not defined at this point in the walkthrough.
errors = next_day_predictions - to_predict

# Root mean squared error
rmse = math.sqrt(sum(errors ** 2) / len(next_day_predictions))

# Mean absolute error
mae = sum(abs(errors)) / len(next_day_predictions)
```

```python
import random

import numpy as np

# Fix both seeds so the shuffle is reproducible.
np.random.seed(1)
random.seed(1)

# Randomly reorder the rows of sp500 (the index labels are shuffled along
# with the rows).
sp500 = sp500.loc[np.random.permutation(sp500.index)]
```

```python
# Use the first 70% of the (shuffled) rows as training data.
# The split must be positional (.iloc): after shuffling, the index labels are
# no longer in order, so a label-based .loc slice would not select 70% of the
# rows, and .loc includes both endpoints, which would put the boundary row in
# both sets.
highest_train_row = int(sp500.shape[0] * .7)
train = sp500.iloc[:highest_train_row, :]

# Use the remaining 30% of the rows as test data.
test = sp500.iloc[highest_train_row:, :]

# Fit on the training set only.
regressor = LinearRegression()
predictors = train[['value']]
to_predict = train['next_day']
regressor.fit(predictors, to_predict)

# Evaluate on the held-out test set.
next_day_predictions = regressor.predict(test[['value']])
mse = sum((next_day_predictions - test['next_day']) ** 2) / len(next_day_predictions)
```

```python
import matplotlib.pyplot as plt

# Scatter the actual test points.
plt.scatter(test['value'], test['next_day'])

# Overlay the model's predictions for the same test rows. The predictions
# are stored in `next_day_predictions`; the name `predictions` used here
# previously is never defined.
plt.plot(test['value'], next_day_predictions)
plt.show()
```