import pandas as pd
import numpy as np
df = pd.read_csv("cancer_reg.csv", sep=',', encoding='gbk')
df=df.drop(["Geography"],axis=1)
df=df.drop(["binnedInc"],axis=1)
df['PctSomeCol18_24'].fillna(df['PctSomeCol18_24'].mean(), inplace=True)
df['PctEmployed16_Over'].fillna(df['PctEmployed16_Over'].mean(), inplace=True)
df['PctPrivateCoverageAlone'].fillna(df['PctPrivateCoverageAlone'].mean(), inplace=True)
X=df.iloc[:, df.columns != "TARGET_deathRate"].values
y=df.loc[:,"TARGET_deathRate"].values
y.reshape(-1,1)
X.shape
from sklearn.model_selection import train_test_split
X_train,y_train,X_test,y_test=train_test_split(X,y , test_size=0.25, random_state=0)
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.fit_transform(X_test)
scaler.fit(y_train)
y_train = scaler.transform(y_train)
y_test = scaler.fit_transform(y_test)
from sklearn.linear_model import LinearRegression
lr=LinearRegression()
lr.fit(X_train,y_train)
ValueError: Found input variables with inconsistent numbers of samples: [2285, 762]
how to solve this error