# Import the necessary libraries
import numpy as np
import pandas as pd
# Generate random data: n points with x ~ Uniform[0, 1) and y = 2x + N(0, 1) noise.
# No random seed is set, so every run produces a different dataset.
n = 100
x = np.random.rand(n)
y = 2*x + np.random.normal(size=n)

# Create a DataFrame and save to CSV (index=False keeps the row index out of the file,
# so the file holds exactly two columns: 'x' then 'y').
df = pd.DataFrame({'x': x, 'y': y})
df.to_csv('your_dataset.csv', index=False)
OLS regression
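OLS (ordinary least squares) regression is a method for estimating the parameters of a linear regression model. The goal is to find the line that best fits a set of data points, i.e. the line that minimizes the sum of squared differences between the observed and predicted values. The line is represented by an equation of the form y = mx + b,
where: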
- y is the dependent variable,
- x is the independent variable,
- m is the slope of the line, and
- b is the y-intercept.
Generate random data
import numpy as np
from numpy import ndarray
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
# Step 1: Load the data and split into independent and dependent variables.
# Every column except the last is treated as a feature; the last column is the target.
data = pd.read_csv('your_dataset.csv')
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# add a column of 1s to the X matrix for the intercept term
# (the ones go first, so beta[0] is the intercept and column 1 is the first feature)
X = np.append(arr=np.ones((len(X), 1)), values=X, axis=1)

# calculate the coefficients using the OLS formula: beta = (X^T X)^-1 X^T y
# NOTE: the explicit inverse mirrors the textbook formula; it fails with
# LinAlgError when X^T X is singular (e.g. perfectly collinear columns).
beta = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)

# fitted values for the training data
y_pred = X.dot(beta)
def mean_squared_error(y_true:ndarray, y_pred:ndarray):
    """Return the mean of the squared differences between y_true and y_pred.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values, same shape as ``y_true``.

    Returns:
        The average of ``(y_true - y_pred) ** 2`` over all elements.
    """
    # Vectorized subtraction replaces the per-index Python loop; asarray lets
    # plain sequences be passed as well as ndarrays.
    errors = np.asarray(y_true) - np.asarray(y_pred)
    mse = np.mean(errors ** 2)
    return mse
def r2_score(y_true:ndarray, y_pred:ndarray):
    """Return the coefficient of determination, ``1 - SSR / SST``.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values, same shape as ``y_true``.

    Returns:
        ``1 - ssr / sst`` where ``ssr`` is the sum of squared residuals and
        ``sst`` is the sum of squared deviations of ``y_true`` from its mean.
        The ratio is undefined when ``y_true`` is constant (``sst`` is 0).
    """
    observed = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    # Compute the mean once rather than once per element.
    observed_mean = np.mean(observed)
    ssr = np.sum((observed - predicted) ** 2)
    sst = np.sum((observed - observed_mean) ** 2)
    r2 = 1 - (ssr / sst)
    return r2
# Evaluate the fit on the training data: RMSE is the square root of the MSE,
# R-squared comes from r2_score.
rmse = np.sqrt(mean_squared_error(y, y_pred))
r2 = r2_score(y, y_pred)

print("RMSE: ", rmse)
print("R-squared: ", r2)
RMSE: 1.0862489887741527
R-squared: 0.29675681489500483
# Plot the observations (blue points) and the fitted line (red).
# Column 0 of X is the intercept column of ones, so column 1 holds the feature values.
plt.scatter(X[:, 1], y, color='blue')
plt.plot(X[:, 1], y_pred, color='red')
plt.title('OLS Regression')
plt.xlabel('Independent variable')
plt.ylabel('Dependent variable')
plt.show()
Text(0.5, 1.0, 'OLS Regression')
Text(0.5, 0, 'Independent variable')
Text(0, 0.5, 'Dependent variable')
Citation
BibTeX citation:
@online{bochman2023,
author = {Bochman, Oren},
title = {OLS Regression {From} {Scratch}},
date = {2023-02-01},
url = {https://orenbochman.github.io/posts/2023/2023-02-01-ds-from-scratch/ols-regression-from-scratch.html},
langid = {en}
}
For attribution, please cite this work as:
Bochman, Oren. 2023. “OLS Regression From Scratch.”
February 1, 2023. https://orenbochman.github.io/posts/2023/2023-02-01-ds-from-scratch/ols-regression-from-scratch.html.