import numpy as np
import pandas as pd
from openai import OpenAI
from pyreal.explainers import (
Explainer,
GlobalFeatureImportance,
LocalFeatureContribution,
SimilarExamples,
)
from pyreal.transformers import run_transformers, sklearn_pipeline_to_pyreal_transformers
from pyreal.utils import get_top_contributors
def format_feature_contribution_output(explanation, ids=None, series=False, optimized=False):
"""
Format Pyreal FeatureContributionExplanation objects into Local Feature Contribution outputs
Args:
explanation (FeatureContributionExplanation):
Pyreal Explanation object to parse
ids (list of strings or ints):
List of row ids
series (Boolean):
If True, the produce function was passed a series input
optimized (Boolean):
If True, return in a simple DataFrame format
Returns:
DataFrame (if series), else {"id" -> DataFrame}
One dataframe per id, with each row representing a feature, and four columns:
Feature Name Feature Value Contribution Average/Mode
if optimized: DataFrame, with one row per instance and one column per feature
"""
if ids is None:
ids = explanation.get().index
if optimized:
return explanation.get().set_index(ids), explanation.get_values().set_index(ids)
average_mode = _get_average_or_mode(explanation.get_values())
explanation_dict = {}
for i, row_id in enumerate(ids):
contributions = explanation.get().iloc[i, :]
values = explanation.get_values().iloc[i, :].loc[contributions.index]
average_mode = average_mode.loc[contributions.index]
feature_names = contributions.index
explanation_dict[row_id] = pd.DataFrame.from_dict(
{
"Feature Name": feature_names.values,
"Feature Value": values.values,
"Contribution": contributions.values,
"Average/Mode": average_mode.values,
}
)
if series:
return explanation_dict[next(iter(explanation_dict))]
return explanation_dict
def format_feature_importance_output(explanation, optimized=False):
    """
    Turn a Pyreal FeatureImportanceExplanation into a Global Feature Importance output

    Args:
        explanation (FeatureImportanceExplanation):
            Pyreal Explanation object to parse
        optimized (Boolean):
            If True, hand back the explanation's DataFrame unchanged

    Returns:
        If optimized, the DataFrame returned by explanation.get().
        Otherwise, a DataFrame with a "Feature Name" column and an "Importance" column
        and a fresh 0..n-1 index
    """
    importances = explanation.get()
    if not optimized:
        table = pd.DataFrame(
            {
                "Feature Name": importances.columns,
                "Importance": importances.squeeze(),
            }
        )
        # Discard the feature-name index inherited from the squeezed importances
        return table.reset_index(drop=True)
    return importances
def format_similar_examples_output(
    explanation, ids=None, series=False, y_format_func=None, optimized=False
):
    """
    Turn a Pyreal SimilarExamples explanation into Similar Examples outputs

    Args:
        explanation (SimilarExampleExplanation):
            Pyreal Explanation object to parse
        ids (list of strings or ints):
            Row ids to key the output by. If None, explanation.get_row_ids() is used
        series (Boolean):
            If True, the produce function was passed a series input
        y_format_func (function):
            Function applied to every ground-truth value, if given
        optimized (Boolean):
            Currently a no-op, included for consistency with the other formatters

    Returns:
        {"X": DataFrame, "y": Series, "Input": Series} (if series),
             else {"id" -> {"X": DataFrame, "y": Series, "Input": Series}}
            X is the examples, ordered from top to bottom by similarity to input and
            y is the corresponding y values
            Input is the original input in the same feature space
    """
    if ids is None:
        ids = explanation.get_row_ids()

    formatted = {}
    # Examples, targets and input rows are all looked up by position, not by row id
    for position, row_id in enumerate(ids):
        y_values = explanation.get_targets(row_id=position)
        x_values = explanation.get_examples(row_id=position)
        if y_format_func is not None:
            y_values = y_values.apply(y_format_func)
        entry = {"X": x_values, "y": y_values}
        entry["Input"] = explanation.get_values().iloc[position, :]
        formatted[row_id] = entry

    if not series:
        return formatted
    first_id = next(iter(formatted))
    return formatted[first_id]
def format_narratives(narratives, ids, series=False, optimized=False):
    """
    Format a collection of narrative explanations

    Args:
        narratives (iterable):
            Narratives, in the same order as ids
        ids (iterable of strings or ints):
            Row ids to pair with the narratives
        series (Boolean):
            If True, the produce function was passed a series input
        optimized (Boolean):
            If True, skip formatting

    Returns:
        narratives unchanged if series or optimized, else {row_id -> narrative}
    """
    if not (optimized or series):
        return dict(zip(ids, narratives))
    return narratives
def _get_average_or_mode(df):
    """
    Summarize every column of df: mean for numeric columns, mode for all others

    Args:
        df (DataFrame):
            Input

    Returns:
        Series
            One entry per column of df. When non-numeric columns are present, their modes
            come first, followed by the numeric means
    """
    numeric_means = df.select_dtypes(np.number).mean()
    if len(numeric_means) == df.shape[1]:
        # Nothing but numeric columns, so there is no mode to compute
        return numeric_means
    # mode() can return several rows on ties; keep the first one
    categorical_modes = df.drop(columns=numeric_means.index).mode().iloc[0]
    return pd.concat((categorical_modes, numeric_means))
class RealApp:
    """
    Maintains all information about a Pyreal application to generate explanations
    """

    def __init__(
        self,
        models,
        X_train_orig=None,
        y_train=None,
        transformers=None,
        feature_descriptions=None,
        active_model_id=None,
        classes=None,
        class_descriptions=None,
        pred_format_func=None,
        fit_transformers=False,
        id_column=None,
        openai_api_key=None,
        openai_client=None,
        context_description="",
    ):
        """
        Initialize a RealApp object

        Args:
            models (model object, list of models, or dict of model_id:model):
                Model(s) for this application
            X_train_orig (DataFrame of shape (n_instances,n_features):
                Training data for models. If None, must be provided when preparing explainers.
            y_train (DataFrame of shape (n_instances,)):
                The y values for the dataset
            transformers (Transformer object or list of Transformer objects):
                Transformers for this application
            feature_descriptions (dictionary of feature_name:feature_description):
                Mapping of default feature names to readable names
            active_model_id (string or int):
                ID of model to store as active model, if None, this is set to the first model
            classes (array):
                List of class names returned by the model, in the order that the internal model
                considers them if applicable.
                Can be automatically extracted if model is an sklearn classifier
                None if model is not a classifier
            class_descriptions (dict):
                Interpretable descriptions of each class
                None if model is not a classifier
            pred_format_func (function):
                Function to format model prediction outputs
            fit_transformers (Boolean):
                If True, fit the transformers to X_train_orig on initialization
            id_column (string or int):
                Name of column that contains item ids in input data
            openai_api_key (string):
                OpenAI API key. Required for GPT narrative explanations, unless openai client
                is provided
            openai_client (openai.Client):
                OpenAI client object, with API key already set. If provided, openai_api_key is
                ignored
            context_description (string):
                Description of the model's prediction task, in sentence format. This is used by
                LLM model for narrative explanations.
                For example: "The model predicts the price of houses."

        Raises:
            ValueError: if active_model_id is given but is not a key of models
        """
        # Models are always stored as {model_id: model}. Only a dict input obliges the
        # caller to supply explicit ids later on (see add_model).
        self.expect_model_id = False
        if isinstance(models, dict):
            self.expect_model_id = True
            self.models = models
        elif isinstance(models, list):
            self.models = dict(enumerate(models))
        else:  # assume single model given
            self.models = {0: models}

        if active_model_id is not None:
            if active_model_id not in self.models:
                raise ValueError("active_model_id not in models")
            self.active_model_id = active_model_id
        else:
            self.active_model_id = next(iter(self.models))

        # The id column is not a feature, so it is stripped from the stored training data.
        self.id_column = id_column
        if (
            X_train_orig is not None
            and self.id_column is not None
            and self.id_column in X_train_orig
        ):
            self.X_train_orig = X_train_orig.drop(columns=self.id_column)
        else:
            self.X_train_orig = X_train_orig
        self.y_train = y_train

        self.classes = classes
        self.class_descriptions = class_descriptions
        self.pred_format_func = pred_format_func

        self.transformers = self._normalize_transformers(transformers)
        self.feature_descriptions = feature_descriptions

        if openai_client is not None:
            self.openai_client = openai_client
        elif openai_api_key is not None:
            self.openai_client = OpenAI(api_key=openai_api_key)
        else:
            self.openai_client = None

        if fit_transformers:
            # Hacky way of fitting transformers, may want to clean up later:
            # the throwaway Explainer is built only for its fitting side effect.
            Explainer(
                self.models[next(iter(self.models))],
                self.X_train_orig,
                transformers=self.transformers,
                fit_transformers=True,
            )

        # Base explainer used for general transformations and model predictions
        # Also validates data, model, and transformers
        self.base_explainers = {
            model_id: self._make_base_explainer(self.models[model_id]) for model_id in self.models
        }

        self.explainers = {}  # Dictionary of dictionaries:
        # {"explanation_type": {"algorithm":Explainer} }

        if context_description is None:
            context_description = ""
        self.context_description = context_description

    @staticmethod
    def _normalize_transformers(transformers):
        """
        Normalize the transformers argument.

        Args:
            transformers (Transformer, list of Transformers, or None):
                Value given by the caller

        Returns:
            None if no transformers were given, the list unchanged if a list was given,
            otherwise a one-element list wrapping the single transformer
        """
        if transformers is None or isinstance(transformers, list):
            return transformers
        return [transformers]  # assume single transformer given

    def _make_base_explainer(self, model):
        """
        Make a base explainer for model.

        Args:
            model (model object):
                The model to be explained by this explainer

        Returns:
            Explainer
                The explainer
        """
        return Explainer(
            model,
            transformers=self.transformers,
            feature_descriptions=self.feature_descriptions,
        )

    def _explainer_exists(self, explanation_type, algorithm):
        """
        Check if the requested explainer exists

        Args:
            explanation_type (string):
                Code for explanation_type
            algorithm (string):
                Name of algorithm

        Returns:
            Boolean
                True if the specified explainer exists, False otherwise
        """
        return (
            explanation_type in self.explainers
            and algorithm in self.explainers[explanation_type]
        )

    def _add_explainer(self, explanation_type, algorithm, explainer):
        """
        Add the specified explainer to this RealApp, replacing any explainer already stored
        under the same explanation type and algorithm

        Args:
            explanation_type (string):
                Code for explanation_type
            algorithm (string):
                Name of algorithm
            explainer (Explainer):
                Explainer to add
        """
        self.explainers.setdefault(explanation_type, {})[algorithm] = explainer

    def _get_explainer(self, explanation_type, algorithm=None):
        """
        Get the requested explainer

        Args:
            explanation_type (string):
                Code for explanation_type
            algorithm (string):
                Name of algorithm. If None, return all valid explainers of the requested type.

        Returns:
            Explainer, dict of algorithm:Explainer (if algorithm is None), or False
                The requested explainer(s), or False if not yet fitted
        """
        if explanation_type not in self.explainers:
            return False
        explainers_of_type = self.explainers[explanation_type]
        if algorithm is None:
            return explainers_of_type
        if algorithm not in explainers_of_type:
            return False
        return explainers_of_type[algorithm]

    def _produce_explanation_helper(
        self,
        explanation_type_code,
        algorithm,
        prepare_explainer_func,
        format_output_func,
        format_output=True,
        x_train_orig=None,
        y_train=None,
        x_orig=None,
        model_id=None,
        force_refit=False,
        training_size=None,
        prepare_kwargs=None,
        produce_kwargs=None,
        format_kwargs=None,
        narrative=False,
    ):
        """
        Produce an explanation from a specified Explainer

        Args:
            explanation_type_code (string):
                Code for explanation_type
            algorithm (string):
                Name of algorithm
            prepare_explainer_func (function):
                Function that initializes and fits the appropriate explainer
            format_output_func (function):
                Function that formats Explanation objects into the appropriate output format
            format_output (Boolean):
                If False, return output in simple format. Formatted outputs are more usable
                but take longer to generate.
            x_train_orig (DataFrame of shape (n_instances, n_features)):
                Training data, if not provided at initialization.
            y_train (DataFrame or Series):
                Training targets, if not provided at initialization
            x_orig (DataFrame or Series):
                Data to explain, required for local explanations
            model_id (string or int):
                ID of model to explain
            force_refit (Boolean):
                If True, initialize and fit a new explainer even if the appropriate explainer
                already exists
            training_size (int):
                Number of rows to use in fitting explainer
            prepare_kwargs (dict):
                Additional parameters for explainer init function
            produce_kwargs (dict):
                Additional parameters for explainer produce function
            format_kwargs (dict):
                Additional parameters for format function
            narrative (Boolean):
                If True, use explainer's produce_narrative_explanation() function

        Returns:
            Type varies by explanation type
                The explanation

        Raises:
            ValueError: if narrative is True but the explainer has no
                produce_narrative_explanation method
        """
        if model_id is None:
            model_id = self.active_model_id
        if prepare_kwargs is None:
            prepare_kwargs = {}
        if produce_kwargs is None:
            produce_kwargs = {}
        if format_kwargs is None:
            format_kwargs = {}

        # NOTE(review): explainers are cached by (explanation type, algorithm) only, so an
        # explainer fitted for one model_id is reused when a different model_id is requested
        # unless force_refit=True is passed.
        if self._explainer_exists(explanation_type_code, algorithm) and not force_refit:
            explainer = self._get_explainer(explanation_type_code, algorithm)
        else:
            explainer = prepare_explainer_func(
                model_id=model_id,
                algorithm=algorithm,
                x_train_orig=x_train_orig,
                y_train=y_train,
                training_size=training_size,
                **prepare_kwargs
            )

        if narrative and not hasattr(explainer, "produce_narrative_explanation"):
            raise ValueError("narrative explanations not supported for this explainer")

        if x_orig is not None:
            series = x_orig.ndim == 1
            ids = None
            if self.id_column is not None and self.id_column in x_orig:
                ids = x_orig[self.id_column]
                if series:  # If x was a series, ids will now be a scalar
                    ids = [ids]
                # Drop along the last axis: index for a Series, columns for a DataFrame
                x_orig = x_orig.drop(self.id_column, axis=x_orig.ndim - 1)
            if narrative:
                narratives = explainer.produce_narrative_explanation(
                    x_orig, openai_client=self.openai_client, **produce_kwargs
                )
                if ids is None:
                    ids = x_orig.index
                return format_narratives(
                    narratives,
                    ids=ids,
                    series=series,
                    optimized=not format_output,
                    **format_kwargs
                )
            else:
                explanation = explainer.produce(x_orig, **produce_kwargs)
                return format_output_func(
                    explanation, ids, optimized=not format_output, series=series, **format_kwargs
                )
        else:
            if narrative:
                return explainer.produce_narrative_explanation(**produce_kwargs)
            else:
                explanation = explainer.produce(**produce_kwargs)
                return format_output_func(
                    explanation, optimized=not format_output, **format_kwargs
                )

    def add_model(self, model, model_id=None):
        """
        Add a model

        Args:
            model (model object):
                Model to add
            model_id (string or int):
                ID of model. Must be provided when models was originally given as a dictionary.
                If None, the ID defaults to the number of stored models plus one, moved up
                to the next unused integer if that ID is already taken

        Raises:
            ValueError: if model_id is None but models was originally given as a dictionary
        """
        if model_id is None:
            if self.expect_model_id is True:
                raise ValueError(
                    "Models was originally provided as a dictionary, so you must provide a"
                    " model_id when adding a model"
                )
            else:
                model_id = len(self.models) + 1
                # Explicitly chosen ids may already occupy this slot; never overwrite
                # an existing model with an automatically generated id.
                while model_id in self.models:
                    model_id += 1
        self.models[model_id] = model
        # predict()/predict_proba() look models up through base_explainers, so the new
        # model needs a base explainer as well.
        self.base_explainers[model_id] = self._make_base_explainer(model)

    def set_active_model_id(self, active_model_id):
        """
        Set a new active model

        Args:
            active_model_id (int or string):
                New model id to set as active model

        Raises:
            ValueError: if active_model_id is not the id of a stored model
        """
        if active_model_id not in self.models:
            raise ValueError("active_model_id not in models")
        self.active_model_id = active_model_id

    def get_active_model(self):
        """
        Return the active model

        Returns:
            (model object)
                The active model
        """
        return self.models[self.active_model_id]

    def _predict_helper(self, x, predict_method_name, model_id, as_dict, format):
        """
        Shared implementation of predict() and predict_proba()

        Args:
            x (DataFrame or Series):
                Data to predict on
            predict_method_name (string):
                Name of the base explainer method to call
                ("model_predict" or "model_predict_proba")
            model_id (int or string):
                Model to use for prediction, or None for the active model
            as_dict (Boolean or None):
                See predict()
            format (Boolean):
                See predict()

        Returns:
            Predictions, either as returned by the base explainer (optionally formatted
            into a list) or as a {row_id: pred} dictionary
        """
        is_series = x.ndim == 1
        if as_dict is None:
            as_dict = not is_series

        if self.id_column is not None and self.id_column in x:
            ids = x[self.id_column]
            if is_series:  # a Series holds one row, so its id is a scalar
                ids = [ids]
            x = x.drop(self.id_column, axis=x.ndim - 1)
        elif is_series:
            # The index of a Series holds feature names, not row ids; fall back to its name
            ids = [x.name]
        else:
            ids = x.index

        if model_id is None:
            model_id = self.active_model_id

        preds = getattr(self.base_explainers[model_id], predict_method_name)(x)
        apply_format = format and self.pred_format_func is not None

        if not as_dict:
            if apply_format:
                return [self.pred_format_func(pred) for pred in preds]
            return preds

        preds_dict = {}
        for i, row_id in enumerate(ids):
            if apply_format:
                preds_dict[row_id] = self.pred_format_func(preds[i])
            else:
                preds_dict[row_id] = preds[i]
        return preds_dict

    def predict(self, x, model_id=None, as_dict=None, format=True):
        """
        Predict on x using the active model or model specified by model_id

        Args:
            x (DataFrame of shape (n_instances, n_features) or Series of len n_features):
                Data to predict on
            model_id (int or string):
                Model to use for prediction
            as_dict (Boolean):
                If False, return predictions as a single Series/List. Otherwise, return
                in {row_id: pred} format. Defaults to True if x is a DataFrame, False otherwise
            format (Boolean):
                If False, do not run the realapp's format function on this output

        Returns:
            (model return type)
                Model prediction on x
        """
        return self._predict_helper(x, "model_predict", model_id, as_dict, format)

    def predict_proba(self, x, model_id=None, as_dict=None, format=True):
        """
        Return the predicted probabilities of x using the active model or
        model specified by model_id, only if the model has a predict_proba method

        Args:
            x (DataFrame of shape (n_instances, n_features) or Series of len n_features):
                Data to predict on
            model_id (int or string):
                Model to use for prediction
            as_dict (Boolean):
                If False, return predictions as a single Series/List. Otherwise, return
                in {row_id: pred} format. Defaults to True if x is a DataFrame, False otherwise
            format (Boolean):
                If False, do not run the realapp's format function on this output

        Returns:
            (model return type)
                Model prediction on x in terms of probability
        """
        return self._predict_helper(x, "model_predict_proba", model_id, as_dict, format)

    def prepare_feature_contributions(
        self,
        x_train_orig=None,
        y_train=None,
        model_id=None,
        algorithm=None,
        shap_type=None,
        training_size=None,
    ):
        """
        Initialize and fit a local feature contribution explainer

        Args:
            x_train_orig (DataFrame of shape (n_instances, n_features)):
                Training data, if not provided at initialization.
            y_train (DataFrame or Series):
                Training targets, if not provided at initialization
            model_id (int or string):
                Model id to explain
            algorithm (string):
                LFC algorithm to use. Defaults to "shap"
            shap_type (string):
                If algorithm is "shap", type of shap to use
            training_size (int):
                Number of rows to use in fitting explainer

        Returns:
            A fit LocalFeatureContribution explainer
        """
        if algorithm is None:
            algorithm = "shap"
        if model_id is None:
            model_id = self.active_model_id

        explainer = LocalFeatureContribution(
            self.models[model_id],
            transformers=self.transformers,
            feature_descriptions=self.feature_descriptions,
            e_algorithm=algorithm,
            shap_type=shap_type,
            classes=self.classes,
            class_descriptions=self.class_descriptions,
            training_size=training_size,
            openai_client=self.openai_client,
        )
        explainer.fit(self._get_x_train_orig(x_train_orig), self._get_y_train(y_train))
        self._add_explainer("lfc", algorithm, explainer)
        return explainer

    def produce_feature_contributions(
        self,
        x_orig,
        model_id=None,
        x_train_orig=None,
        y_train=None,
        algorithm=None,
        format_output=True,
        shap_type=None,
        force_refit=False,
        training_size=None,
        num_features=None,
        select_by="absolute",
    ):
        """
        Produce a feature contribution explanation

        Args:
            x_orig (DataFrame of shape (n_instances, n_features) or Series of length (n_features)):
                Input(s) to explain
            model_id (string or int):
                ID of model to explain
            x_train_orig (DataFrame):
                Data to fit on, if not provided during initialization
            y_train (DataFrame or Series):
                Training targets to fit on, if not provided during initialization
            algorithm (string):
                Name of algorithm
            format_output (Boolean):
                If False, return output in the simple (contributions, values) DataFrame
                format. Formatted outputs are more usable but take longer to generate.
            shap_type (string):
                If algorithm="shap", type of SHAP explainer to use
            force_refit (Boolean):
                If True, initialize and fit a new explainer even if the appropriate explainer
                already exists
            training_size (int):
                Number of rows to use in fitting explainer
            num_features (int):
                Number of features to include in the explanation. If None, include all
                features. Only applied to formatted output (format_output=True)
            select_by (one of "absolute", "min", "max"):
                If `num_features` is not None, method to use for selecting which features to show.
                Not used if num_features is None

        Returns:
            dictionary (if x_orig is DataFrame) or DataFrame (if x_orig is Series)
                One dataframe per id, with each row representing a feature, and four columns:
                Feature Name    Feature Value   Contribution    Average/Mode
        """
        if algorithm is None:
            algorithm = "shap"

        exp = self._produce_explanation_helper(
            "lfc",
            algorithm,
            self.prepare_feature_contributions,
            format_feature_contribution_output,
            x_train_orig=x_train_orig,
            y_train=y_train,
            x_orig=x_orig,
            format_output=format_output,
            model_id=model_id,
            force_refit=force_refit,
            training_size=training_size,
            prepare_kwargs={"shap_type": shap_type},
        )
        if num_features is None:
            return exp
        if isinstance(exp, dict):
            # DataFrame input: one formatted table per row id
            return {
                row_id: get_top_contributors(
                    table, num_features=num_features, select_by=select_by
                )
                for row_id, table in exp.items()
            }
        if isinstance(exp, pd.DataFrame):
            # Series input: the helper already unwrapped the single formatted table
            return get_top_contributors(exp, num_features=num_features, select_by=select_by)
        # Unformatted output is a (contributions, values) tuple; leave it untouched
        return exp

    def produce_narrative_feature_contributions(
        self,
        x_orig,
        model_id=None,
        x_train_orig=None,
        y_train=None,
        algorithm=None,
        shap_type=None,
        force_refit=False,
        training_size=None,
        format_output=True,
        num_features=5,
        select_by="absolute",
        llm_model="gpt3.5",
        detail_level="high",
        context_description=None,
        max_tokens=200,
        temperature=0.5,
    ):
        """
        Produce a feature contribution explanation, formatted in natural language sentence
        format using LLMs.

        Args:
            x_orig (DataFrame of shape (n_instances, n_features) or Series of length (n_features)):
                Input(s) to explain
            model_id (string or int):
                ID of model to explain
            x_train_orig (DataFrame):
                Data to fit on, if not provided during initialization
            y_train (DataFrame or Series):
                Training targets to fit on, if not provided during initialization
            algorithm (string):
                Name of algorithm
            shap_type (string):
                If algorithm="shap", type of SHAP explainer to use
            force_refit (Boolean):
                If True, initialize and fit a new explainer even if the appropriate explainer
                already exists
            training_size (int):
                Number of rows to use in fitting explainer
            format_output (bool):
                If False, return output as a single list of narratives. Formatted outputs are more
                usable, but formatting may slow down runtimes on larger inputs
            num_features (int):
                Number of features to include in the explanation. If None, include all features
            select_by (one of "absolute", "min", "max"):
                Accepted for interface consistency; currently not forwarded to the explainer
            llm_model (string):
                One of ["gpt3.5", "gpt4"]. LLM model to use to generate the explanation.
                GPT4 may provide better results, but is more expensive.
            detail_level (string):
                One of ["high", "low"]. Level of detail to include in the explanation.
                High detail should include precise contribution values. Low detail
                will include only basic information about features used.
            context_description (string):
                Description of the model's prediction task, in sentence format. This will be
                passed to the LLM and may help produce more accurate explanations.
                For example: "The model predicts the price of houses."
                If None, the context description given at initialization is used.
            max_tokens (int):
                Maximum number of tokens to use in the explanation
            temperature (float):
                LLM Temperature to use. Values closer to 1 will produce more creative values.
                Values closer to 0 will produce more consistent or conservative explanations.

        Returns:
            {row_id -> narrative} if x_orig is a DataFrame and format_output is True,
            otherwise the narratives exactly as returned by the explainer
        """
        if algorithm is None:
            algorithm = "shap"
        if context_description is None:
            context_description = self.context_description

        exp = self._produce_explanation_helper(
            "lfc",
            algorithm,
            self.prepare_feature_contributions,
            format_feature_contribution_output,
            x_train_orig=x_train_orig,
            y_train=y_train,
            x_orig=x_orig,
            model_id=model_id,
            force_refit=force_refit,
            training_size=training_size,
            format_output=format_output,
            prepare_kwargs={"shap_type": shap_type},
            produce_kwargs={
                "llm_model": llm_model,
                "detail_level": detail_level,
                "max_tokens": max_tokens,
                "temperature": temperature,
                "num_features": num_features,
                "context_description": context_description,
            },
            narrative=True,
        )
        return exp

    def prepare_feature_importance(
        self,
        x_train_orig=None,
        y_train=None,
        model_id=None,
        algorithm=None,
        shap_type=None,
        training_size=None,
    ):
        """
        Initialize and fit a global feature importance explainer

        Args:
            x_train_orig (DataFrame of shape (n_instances, n_features)):
                Training data, if not provided at initialization.
            y_train (DataFrame or Series):
                Training targets, if not provided at initialization
            model_id (int or string):
                Model id to explain
            algorithm (string):
                GFI algorithm to use. Defaults to "shap"
            shap_type (string):
                If algorithm is "shap", type of shap to use
            training_size (int):
                Number of rows to use in fitting explainer

        Returns:
            A fit GlobalFeatureImportance explainer
        """
        if algorithm is None:
            algorithm = "shap"
        if model_id is None:
            model_id = self.active_model_id

        explainer = GlobalFeatureImportance(
            self.models[model_id],
            transformers=self.transformers,
            feature_descriptions=self.feature_descriptions,
            e_algorithm=algorithm,
            classes=self.classes,
            class_descriptions=self.class_descriptions,
            shap_type=shap_type,
            training_size=training_size,
        )
        explainer.fit(self._get_x_train_orig(x_train_orig), self._get_y_train(y_train))
        self._add_explainer("gfi", algorithm, explainer)
        return explainer

    def produce_feature_importance(
        self,
        model_id=None,
        x_train_orig=None,
        y_train=None,
        algorithm=None,
        format_output=True,
        shap_type=None,
        force_refit=False,
        training_size=None,
        num_features=None,
        select_by="absolute",
    ):
        """
        Produce a global feature importance explanation

        Args:
            model_id (string or int):
                ID of model to explain
            x_train_orig (DataFrame):
                Data to fit on, if not provided during initialization
            y_train (DataFrame or Series):
                Training targets to fit on, if not provided during initialization
            algorithm (string):
                Name of algorithm
            format_output (Boolean):
                If False, return output as a single DataFrame. Formatted outputs are more usable
                but take longer to generate.
            shap_type (string):
                If algorithm="shap", type of SHAP explainer to use
            force_refit (Boolean):
                If True, initialize and fit a new explainer even if the appropriate explainer
                already exists
            training_size (int):
                Number of rows to use in fitting explainer
            num_features (int):
                Number of features to include in the explanation. If None, include all features
            select_by (one of "absolute", "min", "max"):
                If `num_features` is not None, method to use for selecting which features to show.
                Not used if num_features is None

        Returns:
            DataFrame with a Feature Name column and an Importance column
        """
        if algorithm is None:
            algorithm = "shap"

        exp = self._produce_explanation_helper(
            "gfi",
            algorithm,
            self.prepare_feature_importance,
            format_feature_importance_output,
            model_id=model_id,
            x_train_orig=x_train_orig,
            y_train=y_train,
            format_output=format_output,
            force_refit=force_refit,
            training_size=training_size,
            prepare_kwargs={"shap_type": shap_type},
        )
        if num_features is not None:
            return get_top_contributors(exp, num_features=num_features, select_by=select_by)
        else:
            return exp

    def prepare_similar_examples(
        self,
        x_train_orig=None,
        y_train=None,
        model_id=None,
        algorithm=None,
        training_size=None,
        standardize=False,
        fast=True,
    ):
        """
        Initialize and fit a nearest neighbor explainer

        Args:
            x_train_orig (DataFrame of shape (n_instances, n_features)):
                Training data, if not provided at initialization.
            y_train (DataFrame or Series):
                Training targets, if not provided at initialization
            model_id (int or string):
                Model id to explain
            algorithm (string):
                NN algorithm to use (current options: [nn])
            training_size (int):
                Number of rows to use in fitting explainer
            standardize (Boolean):
                If True, standardize data before using it to get similar examples.
                Recommended if model-ready data is not already standardized
            fast (Boolean):
                If True, use a faster algorithm for getting similar examples (disable if faiss
                dependency not available)

        Returns:
            A fit SimilarExamples explainer
        """
        if algorithm is None:
            algorithm = "nn"
        if model_id is None:
            model_id = self.active_model_id

        explainer = SimilarExamples(
            self.models[model_id],
            transformers=self.transformers,
            feature_descriptions=self.feature_descriptions,
            e_algorithm=algorithm,
            classes=self.classes,
            class_descriptions=self.class_descriptions,
            training_size=training_size,
            standardize=standardize,
            fast=fast,
        )
        explainer.fit(self._get_x_train_orig(x_train_orig), self._get_y_train(y_train))
        self._add_explainer("se", algorithm, explainer)
        return explainer

    def produce_similar_examples(
        self,
        x_orig,
        model_id=None,
        x_train_orig=None,
        y_train=None,
        format_output=True,
        num_examples=3,
        standardize=False,
        fast=True,
        format_y=True,
        algorithm=None,
        force_refit=False,
    ):
        """
        Produce a similar examples explanation

        Args:
            x_orig (DataFrame):
                Input to explain
            model_id (string or int):
                ID of model to explain
            x_train_orig (DataFrame):
                Data to fit on, if not provided during initialization
            y_train (DataFrame or Series):
                Training targets to fit on, if not provided during initialization
            format_output (Boolean):
                No functionality, included for consistency
            num_examples (int):
                Number of similar examples to return
            standardize (Boolean):
                If True, standardize data before using it to get similar examples.
                Recommended if model-ready data is not already standardized
            fast (Boolean):
                If True, use a faster algorithm for generating similar examples. Disable if
                faiss is not available
            format_y (Boolean):
                If True, format the ground truth y values returned using self.pred_format_func
            algorithm (string):
                Name of algorithm
            force_refit (Boolean):
                If True, initialize and fit a new explainer even if the appropriate explainer
                already exists

        Returns:
            {"X": DataFrame, "y": Series, "Input": Series} (if series),
                 else {"id" -> {"X": DataFrame, "y": Series, "Input": Series}}
                X is the examples, ordered from top to bottom by similarity to input and
                y is the corresponding y values
                Input is the original input in the same feature space
        """
        if algorithm is None:
            algorithm = "nn"

        format_kwargs = {}
        if format_y:
            format_kwargs["y_format_func"] = self.pred_format_func

        return self._produce_explanation_helper(
            "se",
            algorithm,
            self.prepare_similar_examples,
            format_similar_examples_output,
            x_orig=x_orig,
            model_id=model_id,
            x_train_orig=x_train_orig,
            y_train=y_train,
            format_output=format_output,
            force_refit=force_refit,
            prepare_kwargs={"standardize": standardize, "fast": fast},
            produce_kwargs={"num_examples": num_examples},
            format_kwargs=format_kwargs,
        )

    def train_feature_contribution_llm(
        self, x_train_orig=None, live=True, provide_examples=False, num_inputs=5, num_features=3
    ):
        """
        Run the training process for the LLM model used to generate narrative feature
        contribution explanations.

        Args:
            x_train_orig (DataFrame of shape (n_instances, n_features)):
                Training set to take sample inputs from. If None, the training set must be provided
                to the explainer at initialization.
            live (Boolean):
                If True, run the training process through CLI input/outputs. If False,
                this function will generate a shell training file that will need to be filled out
                and added to the RealApp manually. Currently only live training is supported.
            provide_examples (Boolean):
                If True, generate a base example of explanations at each step. This may make
                the process faster, but will incur costs to your OpenAI API account.
            num_inputs (int):
                Number of inputs to request.
            num_features (int):
                Number of features to include per explanation. If None, all features will be
                included

        Returns:
            list of (explanation, narrative) pairs
                The generated training data
        """
        lfc_explainers = self._get_explainer("lfc")
        if not lfc_explainers:
            # No feature contribution explainer yet: fit the default one first
            self.prepare_feature_contributions(x_train_orig=x_train_orig, algorithm="shap")
            lfc_explainers = self._get_explainer("lfc")

        training_data = None
        for i, algorithm in enumerate(lfc_explainers):
            if i == 0:
                # Train through the first explainer only ...
                training_data = lfc_explainers[algorithm].train_llm(
                    x_train=self._get_x_train_orig(x_train_orig),
                    live=live,
                    provide_examples=provide_examples,
                    num_inputs=num_inputs,
                    num_features=num_features,
                )
            else:
                # ... and share the resulting training data with all the others
                lfc_explainers[algorithm].set_llm_training_data(training_data=training_data)
        return training_data

    def set_openai_client(self, openai_client=None, openai_api_key=None):
        """
        Set the openai client for this RealApp.
        One of openai_client or openai_api_key must be provided.

        Args:
            openai_client (openai.Client):
                OpenAI client object, with API key already set. If provided, openai_api_key is
                ignored
            openai_api_key (string):
                OpenAI API key. If provided, create a new API client.

        Raises:
            ValueError: if neither openai_client nor openai_api_key is provided
        """
        if openai_client is not None:
            self.openai_client = openai_client
        elif openai_api_key is not None:
            self.openai_client = OpenAI(api_key=openai_api_key)
        else:
            raise ValueError("Must provide openai_client or openai_api_key")

    @staticmethod
    def from_sklearn(
        pipeline=None,
        model=None,
        transformers=None,
        X_train=None,
        y_train=None,
        refit_model=True,
        verbose=0,
        **kwargs
    ):
        """
        Create a RealApp from a sklearn pipeline or model and transformers.
        Must provide one of:
            - just pipeline
            - just model
            - model and transformers

        Args:
            pipeline (sklearn.pipeline.Pipeline):
                Sklearn pipeline to convert. The final step of the pipeline must be a model.
            model (sklearn model):
                Sklearn model to use. Ignored if pipeline is not None
            transformers (list of sklearn transformers):
                List of sklearn transformers to use. Ignored if pipeline is not None
            X_train (DataFrame):
                Training data to fit transformers and explanations to. May be required if
                transformers are not fitted or must be recreated. If not provided, must be
                provided when preparing and using realapp explainers.
            y_train (DataFrame or Series):
                Training targets to fit transformers and explanations to. If not provided, must be
                provided when preparing and using realapp explainers.
            refit_model (bool):
                If True, refit the model using the new Pyreal transformers. This may be necessary
                as sklearn and Pyreal transformers may result in an unaligned column order.
                Requires X_train and y_train to be provided.
            verbose (int):
                Verbosity level. If 0, no output. If 1, detailed output
            **kwargs:
                Additional arguments to pass to RealApp constructor.

        Returns:
            RealApp
                Newly created RealApp object

        Raises:
            ValueError: if neither pipeline nor model is given, if pipeline has no `steps`
                attribute, or if refit_model is True and X_train or y_train is missing
        """
        # Transformers alone are not enough: a model must come from somewhere.
        if pipeline is None and model is None:
            raise ValueError("Must provide either pipeline or model")
        if pipeline is not None and not hasattr(pipeline, "steps"):
            raise ValueError("pipeline must be a valid sklearn pipeline")

        pyreal_transformers = []
        if pipeline is not None:
            # The final pipeline step is the model
            model = pipeline.steps[-1][1]
            pyreal_transformers = sklearn_pipeline_to_pyreal_transformers(
                pipeline, X_train, verbose=verbose
            )
        elif transformers is not None:
            pyreal_transformers = sklearn_pipeline_to_pyreal_transformers(
                transformers, X_train, verbose=verbose
            )

        if refit_model:
            if X_train is None or y_train is None:
                raise ValueError("X_train and y_train must be provided to refit the model")
            model.fit(run_transformers(pyreal_transformers, X_train), y_train)

        return RealApp(
            models=model,
            transformers=pyreal_transformers,
            X_train_orig=X_train,
            y_train=y_train,
            **kwargs
        )

    def _get_x_train_orig(self, x_train_orig):
        """
        Helper function to pick the training data to use

        Args:
            x_train_orig (DataFrame or None):
                Provided DataFrame

        Returns:
            x_train_orig with the id column removed (if given), else self.X_train_orig.
            May be None if neither is given
        """
        if x_train_orig is not None:
            if self.id_column is not None and self.id_column in x_train_orig:
                return x_train_orig.drop(columns=self.id_column)
            return x_train_orig
        else:
            return self.X_train_orig

    def _get_y_train(self, y_train):
        """
        Helper function to pick the training targets to use

        Args:
            y_train (DataFrame, Series or None):
                Provided targets

        Returns:
            y_train if given, else self.y_train. May be None if neither is given
        """
        if y_train is not None:
            return y_train
        else:
            return self.y_train