Implementing linear regression in Python¶

Example: 2007 West Coast Ocean Acidification Cruise¶

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy import stats
import statsmodels.formula.api as smf
import pingouin as pg

import PyCO2SYS as pyco2

/Users/tomconnolly/programs/miniconda3/envs/data-book/lib/python3.8/site-packages/outdated/utils.py:14: OutdatedPackageWarning: The package pingouin is out of date. Your version is 0.5.0, the latest is 0.5.1.
Set the environment variable OUTDATED_IGNORE=1 to disable these warnings.
  return warn(

Load data¶

Load 2007 data¶

filename07 = 'wcoa_cruise_2007/32WC20070511.exc.csv'
df07 = pd.read_csv(filename07,header=29,na_values=-999,parse_dates=[[6,7]])

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
/var/folders/67/t9t76vyd78qgn_kwjr1_76yr0000gn/T/ipykernel_74292/425823264.py in <module>
      1 filename07 = 'wcoa_cruise_2007/32WC20070511.exc.csv'
----> 2 df07 = pd.read_csv(filename07,header=29,na_values=-999,parse_dates=[[6,7]])

~/programs/miniconda3/envs/data-book/lib/python3.8/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
    309                     stacklevel=stacklevel,
    310                 )
--> 311             return func(*args, **kwargs)
    312 
    313         return wrapper

~/programs/miniconda3/envs/data-book/lib/python3.8/site-packages/pandas/io/parsers/readers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
    584     kwds.update(kwds_defaults)
    585 
--> 586     return _read(filepath_or_buffer, kwds)
    587 
    588 

~/programs/miniconda3/envs/data-book/lib/python3.8/site-packages/pandas/io/parsers/readers.py in _read(filepath_or_buffer, kwds)
    480 
    481     # Create the parser.
--> 482     parser = TextFileReader(filepath_or_buffer, **kwds)
    483 
    484     if chunksize or iterator:

~/programs/miniconda3/envs/data-book/lib/python3.8/site-packages/pandas/io/parsers/readers.py in __init__(self, f, engine, **kwds)
    809             self.options["has_index_names"] = kwds["has_index_names"]
    810 
--> 811         self._engine = self._make_engine(self.engine)
    812 
    813     def close(self):

~/programs/miniconda3/envs/data-book/lib/python3.8/site-packages/pandas/io/parsers/readers.py in _make_engine(self, engine)
   1038             )
   1039         # error: Too many arguments for "ParserBase"
-> 1040         return mapping[engine](self.f, **self.options)  # type: ignore[call-arg]
   1041 
   1042     def _failover_to_python(self):

~/programs/miniconda3/envs/data-book/lib/python3.8/site-packages/pandas/io/parsers/c_parser_wrapper.py in __init__(self, src, **kwds)
     49 
     50         # open handles
---> 51         self._open_handles(src, kwds)
     52         assert self.handles is not None
     53 

~/programs/miniconda3/envs/data-book/lib/python3.8/site-packages/pandas/io/parsers/base_parser.py in _open_handles(self, src, kwds)
    220         Let the readers open IOHandles after they are done with their potential raises.
    221         """
--> 222         self.handles = get_handle(
    223             src,
    224             "r",

~/programs/miniconda3/envs/data-book/lib/python3.8/site-packages/pandas/io/common.py in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    700         if ioargs.encoding and "b" not in ioargs.mode:
    701             # Encoding
--> 702             handle = open(
    703                 handle,
    704                 ioargs.mode,

FileNotFoundError: [Errno 2] No such file or directory: 'wcoa_cruise_2007/32WC20070511.exc.csv'

df07.keys()

Index(['DATE_TIME', 'EXPOCODE', 'SECT_ID', 'SAMPNO', 'LINE', 'STNNBR',
       'CASTNO', 'LATITUDE', 'LONGITUDE', 'BOT_DEPTH', 'BTLNBR',
       'BTLNBR_FLAG_W', 'CTDPRS', 'CTDTMP', 'CTDSAL', 'CTDSAL_FLAG_W',
       'SALNTY', 'SALNTY_FLAG_W', 'CTDOXY', 'CTDOXY_FLAG_W', 'OXYGEN',
       'OXYGEN_FLAG_W', 'SILCAT', 'SILCAT_FLAG_W', 'NITRAT', 'NITRAT_FLAG_W',
       'NITRIT', 'NITRIT_FLAG_W', 'PHSPHT', 'PHSPHT_FLAG_W', 'TCARBN',
       'TCARBN_FLAG_W', 'ALKALI', 'ALKALI_FLAG_W'],
      dtype='object')

Linear regression: four methods in Python¶

Create \(x\) and \(y\) variables.

x = df07['PHSPHT']
y = df07['NITRAT']

Plot data.

plt.figure()
plt.plot(x,y,'.')
plt.xlabel('phosphate')
plt.ylabel('nitrate')

Text(0, 0.5, 'nitrate')

_images/week05b-wcoa-cruise-regression_9_1.png

Create a subset where both variables have finite values.

ii = np.isfinite(x+y)

ii

     True
     True
     True
     True
     True
        ... 
  True
  True
  True
  True
  True
Length: 2348, dtype: bool

Method 1: Scipy¶

result = stats.linregress(x[ii],y[ii])

result

LinregressResult(slope=14.740034517902107, intercept=-3.9325720551997954, rvalue=0.9860645445968036, pvalue=0.0, stderr=0.05292378356970245, intercept_stderr=0.10258209230911518)

result.slope

14.740034517902107

Exercise: Draw the regression line with the data

plt.figure()
plt.plot(x,y,'k.')
plt.plot(x,result.slope*x+result.intercept,'r-')

[<matplotlib.lines.Line2D at 0x7f8be85239d0>]

_images/week05b-wcoa-cruise-regression_18_1.png

Method 2: statsmodels¶

Ordinary least squares fit using statsmodels.

smres = smf.ols('NITRAT ~ PHSPHT',df07).fit()

smres.summary()

OLS Regression Results
Dep. Variable:	NITRAT	R-squared:	0.972
Model:	OLS	Adj. R-squared:	0.972
Method:	Least Squares	F-statistic:	7.757e+04
Date:	Tue, 22 Feb 2022	Prob (F-statistic):	0.00
Time:	21:22:25	Log-Likelihood:	-4993.9
No. Observations:	2210	AIC:	9992.
Df Residuals:	2208	BIC:	1.000e+04
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	-3.9326	0.103	-38.336	0.000	-4.134	-3.731
PHSPHT	14.7400	0.053	278.514	0.000	14.636	14.844

Omnibus:	874.728	Durbin-Watson:	0.269
Prob(Omnibus):	0.000	Jarque-Bera (JB):	5147.310
Skew:	-1.766	Prob(JB):	0.00
Kurtosis:	9.589	Cond. No.	4.90

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Method 3: pingouin¶

pg.linear_regression(x[ii],y[ii])

	names	coef	se	T	pval	r2	adj_r2	CI[2.5%]	CI[97.5%]
0	Intercept	-3.932572	0.102582	-38.335853	6.540950e-247	0.972323	0.972311	-4.133740	-3.731405
1	PHSPHT	14.740035	0.052924	278.514375	0.000000e+00	0.972323	0.972311	14.636249	14.843820

Data Analysis Techniques in Marine Science

Implementing linear regression in Python

Contents