Iris Dataset
In [1]:
Copied!
from pypekit import Task
import pandas as pd
class IrisLoader(Task):
    """Pipeline entry task that loads the Iris dataset as a DataFrame.

    Declared as consuming ``"source"`` and producing ``"raw"``.
    """

    input_types = ["source"]
    output_types = ["raw"]

    def run(self, _):
        """Return the Iris feature columns plus a ``target`` label column.

        The single positional argument is ignored.
        """
        from sklearn.datasets import load_iris

        bunch = load_iris()
        frame = pd.DataFrame(data=bunch.data, columns=bunch.feature_names)
        frame['target'] = bunch.target
        return frame
class TrainTestSplitter(Task):
    """Mark each row of the raw frame as training or test data.

    Declared as consuming ``"raw"`` and producing ``"split"``.
    """

    input_types = ["raw"]
    output_types = ["split"]

    def run(self, df):
        """Return ``df`` with an added ``train`` flag column.

        Rows are split 80/20 with a fixed seed (``random_state=0``).
        Training rows get ``train == 1`` and test rows ``train == 0``; the
        two parts are concatenated (training rows first) and re-indexed
        from zero.
        """
        from sklearn.model_selection import train_test_split

        train_df, test_df = train_test_split(df, test_size=0.2, random_state=0)
        # ``assign`` returns new frames, so the flag is never written into a
        # slice of the input (which can trigger SettingWithCopyWarning).
        train_df = train_df.assign(train=1)
        test_df = test_df.assign(train=0)
        return pd.concat([train_df, test_df], ignore_index=True)
class Scaler(Task):
    """Base task that rescales the feature columns of a split frame.

    Declared as consuming ``"split"`` and producing ``"processed"``.
    Subclasses supply the concrete scaler through ``get_scaler``.
    """

    input_types = ["split"]
    output_types = ["processed"]

    def run(self, df):
        """Return a frame with scaled features and the original labels.

        The scaler is fitted on the training rows only (``train == 1``) and
        then applied to every row. ``target`` and ``train`` are copied over
        unchanged.
        """
        features = df.drop(columns=['target', 'train'])
        scaler = self.get_scaler()
        scaler.fit(features[df['train'] == 1])
        # Keep the input index so that re-attaching ``target`` / ``train``
        # below aligns row-for-row even when ``df`` has a non-default index.
        scaled_df = pd.DataFrame(
            data=scaler.transform(features),
            columns=features.columns,
            index=features.index,
        )
        scaled_df['target'] = df['target']
        scaled_df['train'] = df['train']
        return scaled_df

    def get_scaler(self):
        """Return the scaler object to fit; must be overridden.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Subclasses should implement this method.")
class MinMaxScaler(Scaler):
    """Scaler task backed by scikit-learn's ``MinMaxScaler``."""

    def get_scaler(self):
        """Return a new ``sklearn.preprocessing.MinMaxScaler`` with defaults."""
        # Import the module rather than the class so the scikit-learn name
        # does not collide with this task class of the same name.
        from sklearn import preprocessing

        return preprocessing.MinMaxScaler()
class StandardScaler(Scaler):
    """Scaler task backed by scikit-learn's ``StandardScaler``."""

    def get_scaler(self):
        """Return a new ``sklearn.preprocessing.StandardScaler`` with defaults."""
        # Import the module rather than the class so the scikit-learn name
        # does not collide with this task class of the same name.
        from sklearn import preprocessing

        return preprocessing.StandardScaler()
class PCA(Task):
    """Task that replaces the feature columns with principal components.

    Declared as consuming ``"split"`` or ``"processed"`` and producing
    ``"processed"``.
    """

    input_types = ["split", "processed"]
    output_types = ["processed"]

    def __init__(self, **kwargs):
        """Store keyword arguments to forward to scikit-learn's ``PCA``."""
        self.kwargs = kwargs

    def run(self, df):
        """Return a frame of components ``PC1..PCn`` plus the label columns.

        The decomposition is fitted on the training rows only
        (``train == 1``) and then applied to every row. ``target`` and
        ``train`` are copied over unchanged.
        """
        # Aliased so the scikit-learn class is not confused with this task.
        from sklearn.decomposition import PCA as SklearnPCA

        features = df.drop(columns=['target', 'train'])
        pca = SklearnPCA(**self.kwargs)
        pca.fit(features[df['train'] == 1])
        components = pca.transform(features)
        # Number the components from 1. The previous f-string had no
        # placeholder, so every column was literally named 'PC[i+1]'.
        component_names = [f'PC{i}' for i in range(1, components.shape[1] + 1)]
        # Keep the input index so that re-attaching ``target`` / ``train``
        # below aligns row-for-row even when ``df`` has a non-default index.
        pca_df = pd.DataFrame(
            data=components,
            columns=component_names,
            index=features.index,
        )
        pca_df['target'] = df['target']
        pca_df['train'] = df['train']
        return pca_df
class Classifier(Task):
    """Base task that fits a classifier and adds its predictions.

    Declared as consuming ``"split"`` or ``"processed"`` and producing
    ``"predicted"``. Subclasses supply the estimator through
    ``get_classifier``.
    """

    input_types = ["split", "processed"]
    output_types = ["predicted"]

    def run(self, df):
        """Return a copy of ``df`` with an added ``predicted`` column.

        The classifier is fitted on the training rows only (``train == 1``)
        and then predicts a label for every row.
        """
        features = df.drop(columns=['target', 'train'])
        is_train = df['train'] == 1
        classifier = self.get_classifier()
        classifier.fit(features[is_train], df.loc[is_train, 'target'])
        # Return a new frame instead of writing into the caller's one.
        # NOTE(review): if the executor reuses an upstream frame across
        # pipelines, an in-place ``predicted`` column would leak into the
        # features of the next classifier -- confirm against pypekit.
        return df.assign(predicted=classifier.predict(features))

    def get_classifier(self):
        """Return the estimator to fit; must be overridden.

        This hook was previously misnamed ``get_scaler``, which ``run``
        never calls.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Subclasses should implement this method.")
class LogisticRegression(Classifier):
    """Classifier task backed by scikit-learn's ``LogisticRegression``."""

    def __init__(self, **kwargs):
        """Store keyword arguments to forward to the estimator."""
        self.kwargs = kwargs

    def get_classifier(self):
        """Return a new estimator built from the stored keyword arguments."""
        # Import the module rather than the class so the scikit-learn name
        # does not collide with this task class of the same name.
        from sklearn import linear_model

        return linear_model.LogisticRegression(**self.kwargs)
class RandomForestClassifier(Classifier):
    """Classifier task backed by scikit-learn's ``RandomForestClassifier``."""

    def __init__(self, **kwargs):
        """Store keyword arguments to forward to the estimator."""
        self.kwargs = kwargs

    def get_classifier(self):
        """Return a new estimator built from the stored keyword arguments."""
        # Import the module rather than the class so the scikit-learn name
        # does not collide with this task class of the same name.
        from sklearn import ensemble

        return ensemble.RandomForestClassifier(**self.kwargs)
class SVC(Classifier):
    """Classifier task backed by scikit-learn's ``SVC``."""

    def __init__(self, **kwargs):
        """Store keyword arguments to forward to the estimator."""
        self.kwargs = kwargs

    def get_classifier(self):
        """Return a new estimator built from the stored keyword arguments."""
        # Import the module rather than the class so the scikit-learn name
        # does not collide with this task class of the same name.
        from sklearn import svm

        return svm.SVC(**self.kwargs)
class Evaluator(Task):
    """Final task that scores predictions on the test rows.

    Declared as consuming ``"predicted"`` and producing ``"sink"``.
    """

    input_types = ["predicted"]
    output_types = ["sink"]

    def run(self, df):
        """Return the fraction of test rows whose prediction equals the target.

        Only rows with ``train == 0`` are taken into account.
        """
        held_out = df[df['train'] == 0]
        matches = held_out['target'] == held_out['predicted']
        return matches.mean()
from pypekit import Task
import pandas as pd
class IrisLoader(Task):
    """Pipeline entry task that loads the Iris dataset as a DataFrame.

    Declared as consuming ``"source"`` and producing ``"raw"``.
    """

    input_types = ["source"]
    output_types = ["raw"]

    def run(self, _):
        """Return the Iris feature columns plus a ``target`` label column.

        The single positional argument is ignored.
        """
        from sklearn.datasets import load_iris

        bunch = load_iris()
        frame = pd.DataFrame(data=bunch.data, columns=bunch.feature_names)
        frame['target'] = bunch.target
        return frame
class TrainTestSplitter(Task):
    """Mark each row of the raw frame as training or test data.

    Declared as consuming ``"raw"`` and producing ``"split"``.
    """

    input_types = ["raw"]
    output_types = ["split"]

    def run(self, df):
        """Return ``df`` with an added ``train`` flag column.

        Rows are split 80/20 with a fixed seed (``random_state=0``).
        Training rows get ``train == 1`` and test rows ``train == 0``; the
        two parts are concatenated (training rows first) and re-indexed
        from zero.
        """
        from sklearn.model_selection import train_test_split

        train_df, test_df = train_test_split(df, test_size=0.2, random_state=0)
        # ``assign`` returns new frames, so the flag is never written into a
        # slice of the input (which can trigger SettingWithCopyWarning).
        train_df = train_df.assign(train=1)
        test_df = test_df.assign(train=0)
        return pd.concat([train_df, test_df], ignore_index=True)
class Scaler(Task):
    """Base task that rescales the feature columns of a split frame.

    Declared as consuming ``"split"`` and producing ``"processed"``.
    Subclasses supply the concrete scaler through ``get_scaler``.
    """

    input_types = ["split"]
    output_types = ["processed"]

    def run(self, df):
        """Return a frame with scaled features and the original labels.

        The scaler is fitted on the training rows only (``train == 1``) and
        then applied to every row. ``target`` and ``train`` are copied over
        unchanged.
        """
        features = df.drop(columns=['target', 'train'])
        scaler = self.get_scaler()
        scaler.fit(features[df['train'] == 1])
        # Keep the input index so that re-attaching ``target`` / ``train``
        # below aligns row-for-row even when ``df`` has a non-default index.
        scaled_df = pd.DataFrame(
            data=scaler.transform(features),
            columns=features.columns,
            index=features.index,
        )
        scaled_df['target'] = df['target']
        scaled_df['train'] = df['train']
        return scaled_df

    def get_scaler(self):
        """Return the scaler object to fit; must be overridden.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Subclasses should implement this method.")
class MinMaxScaler(Scaler):
    """Scaler task backed by scikit-learn's ``MinMaxScaler``."""

    def get_scaler(self):
        """Return a new ``sklearn.preprocessing.MinMaxScaler`` with defaults."""
        # Import the module rather than the class so the scikit-learn name
        # does not collide with this task class of the same name.
        from sklearn import preprocessing

        return preprocessing.MinMaxScaler()
class StandardScaler(Scaler):
    """Scaler task backed by scikit-learn's ``StandardScaler``."""

    def get_scaler(self):
        """Return a new ``sklearn.preprocessing.StandardScaler`` with defaults."""
        # Import the module rather than the class so the scikit-learn name
        # does not collide with this task class of the same name.
        from sklearn import preprocessing

        return preprocessing.StandardScaler()
class PCA(Task):
    """Task that replaces the feature columns with principal components.

    Declared as consuming ``"split"`` or ``"processed"`` and producing
    ``"processed"``.
    """

    input_types = ["split", "processed"]
    output_types = ["processed"]

    def __init__(self, **kwargs):
        """Store keyword arguments to forward to scikit-learn's ``PCA``."""
        self.kwargs = kwargs

    def run(self, df):
        """Return a frame of components ``PC1..PCn`` plus the label columns.

        The decomposition is fitted on the training rows only
        (``train == 1``) and then applied to every row. ``target`` and
        ``train`` are copied over unchanged.
        """
        # Aliased so the scikit-learn class is not confused with this task.
        from sklearn.decomposition import PCA as SklearnPCA

        features = df.drop(columns=['target', 'train'])
        pca = SklearnPCA(**self.kwargs)
        pca.fit(features[df['train'] == 1])
        components = pca.transform(features)
        # Number the components from 1. The previous f-string had no
        # placeholder, so every column was literally named 'PC[i+1]'.
        component_names = [f'PC{i}' for i in range(1, components.shape[1] + 1)]
        # Keep the input index so that re-attaching ``target`` / ``train``
        # below aligns row-for-row even when ``df`` has a non-default index.
        pca_df = pd.DataFrame(
            data=components,
            columns=component_names,
            index=features.index,
        )
        pca_df['target'] = df['target']
        pca_df['train'] = df['train']
        return pca_df
class Classifier(Task):
    """Base task that fits a classifier and adds its predictions.

    Declared as consuming ``"split"`` or ``"processed"`` and producing
    ``"predicted"``. Subclasses supply the estimator through
    ``get_classifier``.
    """

    input_types = ["split", "processed"]
    output_types = ["predicted"]

    def run(self, df):
        """Return a copy of ``df`` with an added ``predicted`` column.

        The classifier is fitted on the training rows only (``train == 1``)
        and then predicts a label for every row.
        """
        features = df.drop(columns=['target', 'train'])
        is_train = df['train'] == 1
        classifier = self.get_classifier()
        classifier.fit(features[is_train], df.loc[is_train, 'target'])
        # Return a new frame instead of writing into the caller's one.
        # NOTE(review): if the executor reuses an upstream frame across
        # pipelines, an in-place ``predicted`` column would leak into the
        # features of the next classifier -- confirm against pypekit.
        return df.assign(predicted=classifier.predict(features))

    def get_classifier(self):
        """Return the estimator to fit; must be overridden.

        This hook was previously misnamed ``get_scaler``, which ``run``
        never calls.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Subclasses should implement this method.")
class LogisticRegression(Classifier):
    """Classifier task backed by scikit-learn's ``LogisticRegression``."""

    def __init__(self, **kwargs):
        """Store keyword arguments to forward to the estimator."""
        self.kwargs = kwargs

    def get_classifier(self):
        """Return a new estimator built from the stored keyword arguments."""
        # Import the module rather than the class so the scikit-learn name
        # does not collide with this task class of the same name.
        from sklearn import linear_model

        return linear_model.LogisticRegression(**self.kwargs)
class RandomForestClassifier(Classifier):
    """Classifier task backed by scikit-learn's ``RandomForestClassifier``."""

    def __init__(self, **kwargs):
        """Store keyword arguments to forward to the estimator."""
        self.kwargs = kwargs

    def get_classifier(self):
        """Return a new estimator built from the stored keyword arguments."""
        # Import the module rather than the class so the scikit-learn name
        # does not collide with this task class of the same name.
        from sklearn import ensemble

        return ensemble.RandomForestClassifier(**self.kwargs)
class SVC(Classifier):
    """Classifier task backed by scikit-learn's ``SVC``."""

    def __init__(self, **kwargs):
        """Store keyword arguments to forward to the estimator."""
        self.kwargs = kwargs

    def get_classifier(self):
        """Return a new estimator built from the stored keyword arguments."""
        # Import the module rather than the class so the scikit-learn name
        # does not collide with this task class of the same name.
        from sklearn import svm

        return svm.SVC(**self.kwargs)
class Evaluator(Task):
    """Final task that scores predictions on the test rows.

    Declared as consuming ``"predicted"`` and producing ``"sink"``.
    """

    input_types = ["predicted"]
    output_types = ["sink"]

    def run(self, df):
        """Return the fraction of test rows whose prediction equals the target.

        Only rows with ``train == 0`` are taken into account.
        """
        held_out = df[df['train'] == 0]
        matches = held_out['target'] == held_out['predicted']
        return matches.mean()
In [2]:
Copied!
from pypekit import Repository, CachedExecutor
# Register the concrete task classes; the abstract bases (Scaler,
# Classifier) are intentionally not listed.
repository = Repository(
    [
        IrisLoader,
        TrainTestSplitter,
        MinMaxScaler,
        StandardScaler,
        PCA,
        LogisticRegression,
        RandomForestClassifier,
        SVC,
        Evaluator,
    ]
)
# Build the task tree, then print its text rendering.
repository.build_tree()
tree_text = repository.build_tree_string()
print(tree_text)
from pypekit import Repository, CachedExecutor
# Register the concrete task classes; the abstract bases (Scaler,
# Classifier) are intentionally not listed.
repository = Repository(
    [
        IrisLoader,
        TrainTestSplitter,
        MinMaxScaler,
        StandardScaler,
        PCA,
        LogisticRegression,
        RandomForestClassifier,
        SVC,
        Evaluator,
    ]
)
# Build the task tree, then print its text rendering.
repository.build_tree()
tree_text = repository.build_tree_string()
print(tree_text)
└── Root()
└── IrisLoader()
└── TrainTestSplitter()
├── MinMaxScaler()
│ ├── PCA()
│ │ ├── LogisticRegression()
│ │ │ └── Evaluator()
│ │ ├── RandomForestClassifier()
│ │ │ └── Evaluator()
│ │ └── SVC()
│ │ └── Evaluator()
│ ├── LogisticRegression()
│ │ └── Evaluator()
│ ├── RandomForestClassifier()
│ │ └── Evaluator()
│ └── SVC()
│ └── Evaluator()
├── StandardScaler()
│ ├── PCA()
│ │ ├── LogisticRegression()
│ │ │ └── Evaluator()
│ │ ├── RandomForestClassifier()
│ │ │ └── Evaluator()
│ │ └── SVC()
│ │ └── Evaluator()
│ ├── LogisticRegression()
│ │ └── Evaluator()
│ ├── RandomForestClassifier()
│ │ └── Evaluator()
│ └── SVC()
│ └── Evaluator()
├── PCA()
│ ├── LogisticRegression()
│ │ └── Evaluator()
│ ├── RandomForestClassifier()
│ │ └── Evaluator()
│ └── SVC()
│ └── Evaluator()
├── LogisticRegression()
│ └── Evaluator()
├── RandomForestClassifier()
│ └── Evaluator()
└── SVC()
└── Evaluator()
In [3]:
Copied!
# Ask the repository for the pipelines it can build from its task tree.
pipelines = repository.build_pipelines()
# verbose=True makes the executor report each pipeline as it completes.
# NOTE(review): presumably CachedExecutor reuses results of shared pipeline
# prefixes -- confirm against the pypekit documentation.
executor = CachedExecutor(pipelines, verbose=True)
# `results` is consumed later via `.values()`; each value is JSON-serializable.
results = executor.run()
# Ask the repository for the pipelines it can build from its task tree.
pipelines = repository.build_pipelines()
# verbose=True makes the executor report each pipeline as it completes.
# NOTE(review): presumably CachedExecutor reuses results of shared pipeline
# prefixes -- confirm against the pypekit documentation.
executor = CachedExecutor(pipelines, verbose=True)
# `results` is consumed later via `.values()`; each value is JSON-serializable.
results = executor.run()
Pipeline 1/18 completed. Runtime: 0.49s. Pipeline 2/18 completed. Runtime: 0.57s. Pipeline 3/18 completed. Runtime: 0.47s. Pipeline 4/18 completed. Runtime: 0.45s. Pipeline 5/18 completed. Runtime: 0.51s. Pipeline 6/18 completed. Runtime: 0.45s. Pipeline 7/18 completed. Runtime: 0.45s. Pipeline 8/18 completed. Runtime: 0.53s. Pipeline 9/18 completed. Runtime: 0.45s. Pipeline 10/18 completed. Runtime: 0.45s. Pipeline 11/18 completed. Runtime: 0.54s. Pipeline 12/18 completed. Runtime: 0.45s. Pipeline 13/18 completed. Runtime: 0.45s. Pipeline 14/18 completed. Runtime: 0.54s. Pipeline 15/18 completed. Runtime: 0.45s. Pipeline 16/18 completed. Runtime: 0.45s. Pipeline 17/18 completed. Runtime: 0.52s. Pipeline 18/18 completed. Runtime: 0.45s.
In [4]:
Copied!
import json
# Pretty-print every pipeline's result record as indented JSON.
for r in results.values():
    rendered = json.dumps(r, indent=2)
    print(rendered)
import json
# Pretty-print every pipeline's result record as indented JSON.
for r in results.values():
    rendered = json.dumps(r, indent=2)
    print(rendered)
{
"output": 0.9,
"runtime": 0.48643693100075325,
"tasks": [
"IrisLoader()",
"TrainTestSplitter()",
"MinMaxScaler()",
"PCA()",
"LogisticRegression()",
"Evaluator()"
]
}
{
"output": 0.9666666666666667,
"runtime": 0.5735052290001477,
"tasks": [
"IrisLoader()",
"TrainTestSplitter()",
"MinMaxScaler()",
"PCA()",
"RandomForestClassifier()",
"Evaluator()"
]
}
{
"output": 0.9666666666666667,
"runtime": 0.4717829759997585,
"tasks": [
"IrisLoader()",
"TrainTestSplitter()",
"MinMaxScaler()",
"PCA()",
"SVC()",
"Evaluator()"
]
}
{
"output": 0.9,
"runtime": 0.4498820519997935,
"tasks": [
"IrisLoader()",
"TrainTestSplitter()",
"MinMaxScaler()",
"LogisticRegression()",
"Evaluator()"
]
}
{
"output": 1.0,
"runtime": 0.514511486000174,
"tasks": [
"IrisLoader()",
"TrainTestSplitter()",
"MinMaxScaler()",
"RandomForestClassifier()",
"Evaluator()"
]
}
{
"output": 1.0,
"runtime": 0.44840691199988214,
"tasks": [
"IrisLoader()",
"TrainTestSplitter()",
"MinMaxScaler()",
"SVC()",
"Evaluator()"
]
}
{
"output": 1.0,
"runtime": 0.45150064299969017,
"tasks": [
"IrisLoader()",
"TrainTestSplitter()",
"StandardScaler()",
"PCA()",
"LogisticRegression()",
"Evaluator()"
]
}
{
"output": 1.0,
"runtime": 0.5293152959993677,
"tasks": [
"IrisLoader()",
"TrainTestSplitter()",
"StandardScaler()",
"PCA()",
"RandomForestClassifier()",
"Evaluator()"
]
}
{
"output": 1.0,
"runtime": 0.44972267999992255,
"tasks": [
"IrisLoader()",
"TrainTestSplitter()",
"StandardScaler()",
"PCA()",
"SVC()",
"Evaluator()"
]
}
{
"output": 1.0,
"runtime": 0.44954042600102184,
"tasks": [
"IrisLoader()",
"TrainTestSplitter()",
"StandardScaler()",
"LogisticRegression()",
"Evaluator()"
]
}
{
"output": 1.0,
"runtime": 0.5382957270003317,
"tasks": [
"IrisLoader()",
"TrainTestSplitter()",
"StandardScaler()",
"RandomForestClassifier()",
"Evaluator()"
]
}
{
"output": 1.0,
"runtime": 0.4474906720006402,
"tasks": [
"IrisLoader()",
"TrainTestSplitter()",
"StandardScaler()",
"SVC()",
"Evaluator()"
]
}
{
"output": 1.0,
"runtime": 0.44923665600072127,
"tasks": [
"IrisLoader()",
"TrainTestSplitter()",
"PCA()",
"LogisticRegression()",
"Evaluator()"
]
}
{
"output": 1.0,
"runtime": 0.5380921020005189,
"tasks": [
"IrisLoader()",
"TrainTestSplitter()",
"PCA()",
"RandomForestClassifier()",
"Evaluator()"
]
}
{
"output": 1.0,
"runtime": 0.4494958460009002,
"tasks": [
"IrisLoader()",
"TrainTestSplitter()",
"PCA()",
"SVC()",
"Evaluator()"
]
}
{
"output": 1.0,
"runtime": 0.4539837970000917,
"tasks": [
"IrisLoader()",
"TrainTestSplitter()",
"LogisticRegression()",
"Evaluator()"
]
}
{
"output": 1.0,
"runtime": 0.5246418990000166,
"tasks": [
"IrisLoader()",
"TrainTestSplitter()",
"RandomForestClassifier()",
"Evaluator()"
]
}
{
"output": 1.0,
"runtime": 0.44512994800015804,
"tasks": [
"IrisLoader()",
"TrainTestSplitter()",
"SVC()",
"Evaluator()"
]
}