Source code for pyreal.realapp.realapp

import numpy as np
import pandas as pd
from openai import OpenAI

from pyreal.explainers import (
    Explainer,
    GlobalFeatureImportance,
    LocalFeatureContribution,
    SimilarExamples,
)
from pyreal.transformers import run_transformers, sklearn_pipeline_to_pyreal_transformers
from pyreal.utils import get_top_contributors


def format_feature_contribution_output(explanation, ids=None, series=False, optimized=False):
    """
    Format Pyreal FeatureContributionExplanation objects into Local Feature Contribution outputs
    Args:
        explanation (FeatureContributionExplanation):
            Pyreal Explanation object to parse
        ids (list of strings or ints):
            List of row ids
        series (Boolean):
            If True, the produce function was passed a series input
        optimized (Boolean):
            If True, return in a simple DataFrame format

    Returns:
        DataFrame (if series), else {"id" -> DataFrame}
            One dataframe per id, with each row representing a feature, and four columns:
                Feature Name    Feature Value   Contribution    Average/Mode
        if optimized: tuple of (contributions, values) DataFrames, each with one row
            per instance and one column per feature
    """
    if ids is None:
        ids = explanation.get().index
    if optimized:
        return explanation.get().set_index(ids), explanation.get_values().set_index(ids)
    average_mode = _get_average_or_mode(explanation.get_values())
    explanation_dict = {}
    for i, row_id in enumerate(ids):
        contributions = explanation.get().iloc[i, :]
        values = explanation.get_values().iloc[i, :].loc[contributions.index]
        # Align to this row's feature order without clobbering the full Series
        row_average_mode = average_mode.loc[contributions.index]

        feature_names = contributions.index

        explanation_dict[row_id] = pd.DataFrame.from_dict(
            {
                "Feature Name": feature_names.values,
                "Feature Value": values.values,
                "Contribution": contributions.values,
                "Average/Mode": average_mode.values,
            }
        )
    if series:
        return explanation_dict[next(iter(explanation_dict))]
    return explanation_dict
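
# Example of the non-optimized output shape (a minimal sketch; the explanation
# object, ids, and feature values below are illustrative, not from a real run):
#
#   out = format_feature_contribution_output(explanation, ids=["a", "b"])
#   out["a"]
#   #   Feature Name  Feature Value  Contribution  Average/Mode
#   # 0          age             34          0.12          41.5
#   # 1         city         Boston         -0.05        Boston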


def format_feature_importance_output(explanation, optimized=False):
    """
    Format Pyreal FeatureImportanceExplanation objects into Global Feature Importance outputs
    Args:
        explanation (FeatureImportanceExplanation):
            Pyreal Explanation object to parse
        optimized (Boolean):
            If True, return in a simple DataFrame format

    Returns:
        DataFrame with a Feature Name column and an Importance column (if not optimized),
        else a single row DataFrame with one column per feature
    """
    importances = explanation.get()
    if optimized:
        return importances
    return pd.DataFrame(
        {"Feature Name": importances.columns, "Importance": importances.squeeze()}
    ).reset_index(drop=True)


def format_similar_examples_output(
    explanation, ids=None, series=False, y_format_func=None, optimized=False
):
    """
    Format Pyreal SimilarExamples objects into Similar Examples outputs
    Args:
        explanation (SimilarExampleExplanation):
            Pyreal Explanation object to parse
        ids (list of strings or ints):
            List of row ids
        series (Boolean):
            If True, the produce function was passed a series input
        y_format_func (function):
            Function to use to format ground truth values
        optimized (Boolean):
            Currently a no-op, included for consistency

    Returns:
        {"X": DataFrame, "y": Series, "Input": Series} (if series),
                else {"id" -> {"X": DataFrame, "y": Series, "Input": Series}}
            X is the examples, ordered from top to bottom by similarity to input and
            y is the corresponding y values
            Input is the original input in the same feature space
    """
    result = {}
    if ids is None:
        ids = explanation.get_row_ids()
    for key, row_id in enumerate(ids):
        examples = explanation.get_examples(row_id=key)
        targets = explanation.get_targets(row_id=key)
        if y_format_func is not None:
            targets = targets.apply(y_format_func)
        result[row_id] = {
            "X": examples,
            "y": targets,
            "Input": explanation.get_values().iloc[key, :],
        }
    if series:
        return result[next(iter(result))]
    return result


def format_narratives(narratives, ids, series=False, optimized=False):
    """
    Format narrative explanations into {row_id: narrative} format, unless a series
    input was given or the optimized format was requested
    """
    if optimized or series:
        return narratives
    return {row_id: narr for row_id, narr in zip(ids, narratives)}


def _get_average_or_mode(df):
    """
    Gets the average of numeric features and the mode of categorical features

    Args:
        df (DataFrame):
            Input
    Returns:
        Series
            Average or mode of every column in df
    """
    s = df.select_dtypes(np.number).mean()
    if len(s) == df.shape[1]:  # all columns are numeric
        return s
    return pd.concat((df.drop(s.index, axis=1).mode().iloc[0], s))
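
# Example of the mixed-dtype behavior above (a minimal sketch with made-up data):
#
#   df = pd.DataFrame({"age": [30, 40, 50], "city": ["NY", "NY", "LA"]})
#   _get_average_or_mode(df)
#   # city      NY     <- mode of the categorical column
#   # age     40.0     <- mean of the numeric column
#   # dtype: object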


class RealApp:
    """
    Maintains all information about a Pyreal application to generate explanations
    """

    def __init__(
        self,
        models,
        X_train_orig=None,
        y_train=None,
        transformers=None,
        feature_descriptions=None,
        active_model_id=None,
        classes=None,
        class_descriptions=None,
        pred_format_func=None,
        fit_transformers=False,
        id_column=None,
        openai_api_key=None,
        openai_client=None,
        context_description="",
    ):
        """
        Initialize a RealApp object

        Args:
            models (model object, list of models, or dict of model_id:model):
                Model(s) for this application
            X_train_orig (DataFrame of shape (n_instances, n_features)):
                Training data for models. If None, must be provided when preparing
                explainers.
            y_train (DataFrame of shape (n_instances,)):
                The y values for the dataset
            transformers (Transformer object or list of Transformer objects):
                Transformers for this application
            feature_descriptions (dictionary of feature_name:feature_description):
                Mapping of default feature names to readable names
            active_model_id (string or int):
                ID of the model to store as the active model. If None, this is set to
                the first model.
            classes (array):
                List of class names returned by the model, in the order that the
                internal model considers them, if applicable. Can be automatically
                extracted if the model is an sklearn classifier. None if the model is
                not a classifier.
            class_descriptions (dict):
                Interpretable descriptions of each class. None if the model is not a
                classifier.
            pred_format_func (function):
                Function to format model prediction outputs
            fit_transformers (Boolean):
                If True, fit the transformers to X_train_orig on initialization
            id_column (string or int):
                Name of the column that contains item ids in input data
            openai_api_key (string):
                OpenAI API key. Required for GPT narrative explanations, unless an
                OpenAI client is provided.
            openai_client (openai.Client):
                OpenAI client object, with API key already set. If provided,
                openai_api_key is ignored.
            context_description (string):
                Description of the model's prediction task, in sentence format. This is
                used by the LLM for narrative explanations. For example: "The model
                predicts the price of houses."
""" self.expect_model_id = False if isinstance(models, dict): self.expect_model_id = True self.models = models elif isinstance(models, list): self.models = {i: models[i] for i in range(0, len(models))} else: # assume single model given self.models = {0: models} if active_model_id is not None: if active_model_id not in self.models: raise ValueError("active_model_id not in models") self.active_model_id = active_model_id else: self.active_model_id = next(iter(self.models)) self.id_column = id_column if ( X_train_orig is not None and self.id_column is not None and self.id_column in X_train_orig ): self.X_train_orig = X_train_orig.drop(columns=self.id_column) else: self.X_train_orig = X_train_orig self.y_train = y_train self.classes = classes self.class_descriptions = class_descriptions self.pred_format_func = pred_format_func if isinstance(transformers, list): self.transformers = transformers else: # assume single transformer given self.transformers = [transformers] self.transformers = transformers self.feature_descriptions = feature_descriptions if openai_client is not None: self.openai_client = openai_client elif openai_api_key is not None: self.openai_client = OpenAI(api_key=openai_api_key) else: self.openai_client = None if fit_transformers: # Hacky way of fitting transformers, may want to clean up later Explainer( self.models[next(iter(self.models))], self.X_train_orig, transformers=self.transformers, fit_transformers=True, ) # Base explainer used for general transformations and model predictions # Also validates data, model, and transformers self.base_explainers = { model_id: self._make_base_explainer(self.models[model_id]) for model_id in self.models } self.explainers = {} # Dictionary of dictionaries: # {"explanation_type": {"algorithm":Explainer} } if context_description is None: context_description = "" self.context_description = context_description

    def _make_base_explainer(self, model):
        """
        Make a base explainer for model.

        Args:
            model (model object):
                The model to be explained by this explainer
        Returns:
            Explainer
                The explainer
        """
        return Explainer(
            model,
            transformers=self.transformers,
            feature_descriptions=self.feature_descriptions,
        )

    def _explainer_exists(self, explanation_type, algorithm):
        """
        Check if the requested explainer exists

        Args:
            explanation_type (string):
                Code for explanation_type
            algorithm (string):
                Name of algorithm
        Returns:
            Boolean
                True if the specified explainer exists, False otherwise
        """
        if explanation_type in self.explainers:
            if algorithm in self.explainers[explanation_type]:
                return True
        return False

    def _add_explainer(self, explanation_type, algorithm, explainer):
        """
        Add the specified explainer to this RealApp

        Args:
            explanation_type (string):
                Code for explanation_type
            algorithm (string):
                Name of algorithm
            explainer (Explainer):
                Explainer to add
        """
        if explanation_type not in self.explainers:
            self.explainers[explanation_type] = {}
        self.explainers[explanation_type][algorithm] = explainer

    def _get_explainer(self, explanation_type, algorithm=None):
        """
        Get the requested explainer

        Args:
            explanation_type (string):
                Code for explanation_type
            algorithm (string):
                Name of algorithm. If None, return all fitted explainers of the
                requested type.
        Returns:
            Explainer or False
                The requested explainer, or False if not yet fitted
        """
        if explanation_type not in self.explainers:
            return False
        if algorithm is None:
            return self.explainers[explanation_type]
        if algorithm not in self.explainers[explanation_type]:
            return False
        return self.explainers[explanation_type][algorithm]

    def _produce_explanation_helper(
        self,
        explanation_type_code,
        algorithm,
        prepare_explainer_func,
        format_output_func,
        format_output=True,
        x_train_orig=None,
        y_train=None,
        x_orig=None,
        model_id=None,
        force_refit=False,
        training_size=None,
        prepare_kwargs=None,
        produce_kwargs=None,
        format_kwargs=None,
        narrative=False,
    ):
        """
        Produce an explanation from a specified Explainer

        Args:
            explanation_type_code (string):
                Code for explanation_type
            algorithm (string):
                Name of algorithm
            prepare_explainer_func (function):
                Function that initializes and fits the appropriate explainer
            format_output_func (function):
                Function that formats Explanation objects into the appropriate output
                format
            format_output (Boolean):
                If False, return output in simple format. Formatted outputs are more
                usable but take longer to generate.
            x_train_orig (DataFrame of shape (n_instances, n_features)):
                Training data, if not provided at initialization.
            y_train (DataFrame or Series):
                Training targets, if not provided at initialization
            x_orig (DataFrame):
                Data to explain, required for local explanations
            model_id (string or int):
                ID of model to explain
            force_refit (Boolean):
                If True, initialize and fit a new explainer even if the appropriate
                explainer already exists
            training_size (int):
                Number of rows to use in fitting explainer
            prepare_kwargs (dict):
                Additional parameters for explainer init function
            produce_kwargs (dict):
                Additional parameters for explainer produce function
            format_kwargs (dict):
                Additional parameters for format function
            narrative (Boolean):
                If True, use the explainer's produce_narrative_explanation() function

        Returns:
            Type varies by explanation type
                The explanation
        """
        if model_id is None:
            model_id = self.active_model_id
        if prepare_kwargs is None:
            prepare_kwargs = {}
        if produce_kwargs is None:
            produce_kwargs = {}
        if format_kwargs is None:
            format_kwargs = {}

        if self._explainer_exists(explanation_type_code, algorithm) and not force_refit:
            explainer = self._get_explainer(explanation_type_code, algorithm)
        else:
            explainer = prepare_explainer_func(
                model_id=model_id,
                algorithm=algorithm,
                x_train_orig=x_train_orig,
                y_train=y_train,
                training_size=training_size,
                **prepare_kwargs,
            )

        if narrative and not hasattr(explainer, "produce_narrative_explanation"):
            raise ValueError("narrative explanations not supported for this explainer")

        if x_orig is not None:
            series = x_orig.ndim == 1
            ids = None
            if self.id_column is not None and self.id_column in x_orig:
                ids = x_orig[self.id_column]
                if series:
                    # If x was a Series, ids will now be a scalar
                    ids = [ids]
                x_orig = x_orig.drop(self.id_column, axis=x_orig.ndim - 1)
            if narrative:
                narratives = explainer.produce_narrative_explanation(
                    x_orig, openai_client=self.openai_client, **produce_kwargs
                )
                if ids is None:
                    ids = x_orig.index
                return format_narratives(
                    narratives, ids=ids, series=series, optimized=not format_output, **format_kwargs
                )
            else:
                explanation = explainer.produce(x_orig, **produce_kwargs)
                return format_output_func(
                    explanation, ids, optimized=not format_output, series=series, **format_kwargs
                )
        else:
            if narrative:
                return explainer.produce_narrative_explanation(**produce_kwargs)
            else:
                explanation = explainer.produce(**produce_kwargs)
                return format_output_func(explanation, optimized=not format_output, **format_kwargs)

    def add_model(self, model, model_id=None):
        """
        Add a model

        Args:
            model (model object):
                Model to add
            model_id (string or int):
                ID of the model. Must be provided when models were originally given as
                a dictionary. If None, the model ID will be incremented from the
                previous model.
        """
        if model_id is None:
            if self.expect_model_id is True:
                raise ValueError(
                    "Models was originally provided as a dictionary, so you must provide a"
                    " model_id when adding a model"
                )
            else:
                model_id = len(self.models) + 1
        self.models[model_id] = model

    def set_active_model_id(self, active_model_id):
        """
        Set a new active model

        Args:
            active_model_id (int or string):
                New model id to set as active model
        """
        if active_model_id not in self.models:
            raise ValueError("active_model_id not in models")
        self.active_model_id = active_model_id

    def get_active_model(self):
        """
        Return the active model

        Returns:
            (model object)
                The active model
        """
        return self.models[self.active_model_id]

    def predict(self, x, model_id=None, as_dict=None, format=True):
        """
        Predict on x using the active model or model specified by model_id

        Args:
            x (DataFrame of shape (n_instances, n_features) or Series of len n_features):
                Data to predict on
            model_id (int or string):
                Model to use for prediction
            as_dict (Boolean):
                If False, return predictions as a single Series/List. Otherwise, return
                in {row_id: pred} format. Defaults to True if x is a DataFrame, False
                otherwise.
            format (Boolean):
                If False, do not run the realapp's format function on this output

        Returns:
            (model return type)
                Model prediction on x
        """
        if as_dict is None:
            as_dict = x.ndim > 1
        if self.id_column is not None and self.id_column in x:
            ids = x[self.id_column]
            x = x.drop(self.id_column, axis=x.ndim - 1)
        else:
            ids = x.index
        if model_id is None:
            model_id = self.active_model_id

        preds = self.base_explainers[model_id].model_predict(x)
        if not as_dict:
            if format and self.pred_format_func is not None:
                return [self.pred_format_func(pred) for pred in preds]
            return preds

        preds_dict = {}
        for i, row_id in enumerate(ids):
            if format and self.pred_format_func is not None:
                preds_dict[row_id] = self.pred_format_func(preds[i])
            else:
                preds_dict[row_id] = preds[i]
        return preds_dict
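
    # Prediction sketch (assuming `app` and `X` from the construction sketch above;
    # outputs are illustrative):
    #
    #   app.predict(X.iloc[:2])                  # -> {row_id_1: pred_1, row_id_2: pred_2}
    #   app.predict(X.iloc[:2], as_dict=False)   # -> [pred_1, pred_2]
    #   app.predict(X.iloc[0])                   # Series input -> unkeyed prediction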

    def predict_proba(self, x, model_id=None, as_dict=None, format=True):
        """
        Return the predicted probabilities of x using the active model or the model
        specified by model_id. Only valid if the model has a predict_proba method.

        Args:
            x (DataFrame of shape (n_instances, n_features) or Series of len n_features):
                Data to predict on
            model_id (int or string):
                Model to use for prediction
            as_dict (Boolean):
                If False, return predictions as a single Series/List. Otherwise, return
                in {row_id: pred} format. Defaults to True if x is a DataFrame, False
                otherwise.
            format (Boolean):
                If False, do not run the realapp's format function on this output

        Returns:
            (model return type)
                Model prediction on x, in terms of probability
        """
        if as_dict is None:
            as_dict = x.ndim > 1
        if self.id_column is not None and self.id_column in x:
            ids = x[self.id_column]
            x = x.drop(self.id_column, axis=x.ndim - 1)
        else:
            ids = x.index
        if model_id is None:
            model_id = self.active_model_id

        preds = self.base_explainers[model_id].model_predict_proba(x)
        if not as_dict:
            if format and self.pred_format_func is not None:
                return [self.pred_format_func(pred) for pred in preds]
            return preds

        preds_dict = {}
        for i, row_id in enumerate(ids):
            if format and self.pred_format_func is not None:
                preds_dict[row_id] = self.pred_format_func(preds[i])
            else:
                preds_dict[row_id] = preds[i]
        return preds_dict

    def prepare_feature_contributions(
        self,
        x_train_orig=None,
        y_train=None,
        model_id=None,
        algorithm=None,
        shap_type=None,
        training_size=None,
    ):
        """
        Initialize and fit a local feature contribution explainer

        Args:
            model_id (int or string):
                Model id to explain
            x_train_orig (DataFrame of shape (n_instances, n_features)):
                Training data, if not provided at initialization.
            y_train (DataFrame or Series):
                Training targets, if not provided at initialization
            algorithm (string):
                LFC algorithm to use
            shap_type (string):
                If algorithm is "shap", type of shap to use
            training_size (int):
                Number of rows to use in fitting explainer

        Returns:
            A fit LocalFeatureContribution explainer
        """
        if algorithm is None:
            algorithm = "shap"
        if model_id is None:
            model_id = self.active_model_id

        explainer = LocalFeatureContribution(
            self.models[model_id],
            transformers=self.transformers,
            feature_descriptions=self.feature_descriptions,
            e_algorithm=algorithm,
            shap_type=shap_type,
            classes=self.classes,
            class_descriptions=self.class_descriptions,
            training_size=training_size,
            openai_client=self.openai_client,
        )
        explainer.fit(self._get_x_train_orig(x_train_orig), self._get_y_train(y_train))
        self._add_explainer("lfc", algorithm, explainer)
        return explainer

    def produce_feature_contributions(
        self,
        x_orig,
        model_id=None,
        x_train_orig=None,
        y_train=None,
        algorithm=None,
        format_output=True,
        shap_type=None,
        force_refit=False,
        training_size=None,
        num_features=None,
        select_by="absolute",
    ):
        """
        Produce a feature contribution explanation

        Args:
            x_orig (DataFrame of shape (n_instances, n_features) or Series of length (n_features)):
                Input(s) to explain
            model_id (string or int):
                ID of model to explain
            x_train_orig (DataFrame):
                Data to fit on, if not provided during initialization
            y_train (DataFrame or Series):
                Training targets to fit on, if not provided during initialization
            algorithm (string):
                Name of algorithm
            format_output (Boolean):
                If False, return output as a single DataFrame. Formatted outputs are
                more usable but take longer to generate.
            shap_type (string):
                If algorithm="shap", type of SHAP explainer to use
            force_refit (Boolean):
                If True, initialize and fit a new explainer even if the appropriate
                explainer already exists
            training_size (int):
                Number of rows to use in fitting explainer
            num_features (int):
                Number of features to include in the explanation. If None, include all
                features.
            select_by (one of "absolute", "min", "max"):
                If `num_features` is not None, method to use for selecting which
                features to show. Not used if num_features is None.

        Returns:
            dictionary (if x_orig is DataFrame) or DataFrame (if x_orig is Series)
                One dataframe per id, with each row representing a feature, and four
                columns:
                    Feature Name    Feature Value   Contribution    Average/Mode
        """
        if algorithm is None:
            algorithm = "shap"
        exp = self._produce_explanation_helper(
            "lfc",
            algorithm,
            self.prepare_feature_contributions,
            format_feature_contribution_output,
            x_train_orig=x_train_orig,
            y_train=y_train,
            x_orig=x_orig,
            format_output=format_output,
            model_id=model_id,
            force_refit=force_refit,
            training_size=training_size,
            prepare_kwargs={"shap_type": shap_type},
        )
        if num_features is not None:
            return {
                row_id: get_top_contributors(
                    exp[row_id], num_features=num_features, select_by=select_by
                )
                for row_id in exp
            }
        else:
            return exp
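
    # Contribution sketch (assuming `app` and `X` as above; `some_row_id` is an
    # illustrative key from X's index):
    #
    #   contributions = app.produce_feature_contributions(X.iloc[:5])
    #   contributions[some_row_id]  # DataFrame with Feature Name, Feature Value,
    #                               # Contribution, and Average/Mode columns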

    def produce_narrative_feature_contributions(
        self,
        x_orig,
        model_id=None,
        x_train_orig=None,
        y_train=None,
        algorithm=None,
        shap_type=None,
        force_refit=False,
        training_size=None,
        format_output=True,
        num_features=5,
        select_by="absolute",
        llm_model="gpt3.5",
        detail_level="high",
        context_description=None,
        max_tokens=200,
        temperature=0.5,
    ):
        """
        Produce a feature contribution explanation, formatted in natural language
        sentence format using LLMs.

        Args:
            x_orig (DataFrame of shape (n_instances, n_features) or Series of length (n_features)):
                Input(s) to explain
            model_id (string or int):
                ID of model to explain
            x_train_orig (DataFrame):
                Data to fit on, if not provided during initialization
            y_train (DataFrame or Series):
                Training targets to fit on, if not provided during initialization
            algorithm (string):
                Name of algorithm
            shap_type (string):
                If algorithm="shap", type of SHAP explainer to use
            force_refit (Boolean):
                If True, initialize and fit a new explainer even if the appropriate
                explainer already exists
            training_size (int):
                Number of rows to use in fitting explainer
            format_output (bool):
                If False, return output as a single list of narratives. Formatted
                outputs are more usable, but formatting may slow down runtimes on
                larger inputs.
            num_features (int):
                Number of features to include in the explanation. If None, include all
                features.
            select_by (one of "absolute", "min", "max"):
                If `num_features` is not None, method to use for selecting which
                features to show. Not used if num_features is None.
            llm_model (string):
                One of ["gpt3.5", "gpt4"]. LLM model to use to generate the
                explanation. GPT4 may provide better results, but is more expensive.
            detail_level (string):
                One of ["high", "low"]. Level of detail to include in the explanation.
                High detail should include precise contribution values. Low detail will
                include only basic information about features used.
            context_description (string):
                Description of the model's prediction task, in sentence format. This
                will be passed to the LLM and may help produce more accurate
                explanations. For example: "The model predicts the price of houses."
            max_tokens (int):
                Maximum number of tokens to use in the explanation
            temperature (float):
                LLM temperature to use. Values closer to 1 will produce more creative
                outputs. Values closer to 0 will produce more consistent or
                conservative explanations.

        Returns:
            dictionary (if x_orig is DataFrame) or string (if x_orig is Series)
                One narrative explanation per input row, keyed by row id when a
                DataFrame is passed
        """
        if algorithm is None:
            algorithm = "shap"
        if context_description is None:
            context_description = self.context_description
        exp = self._produce_explanation_helper(
            "lfc",
            algorithm,
            self.prepare_feature_contributions,
            format_feature_contribution_output,
            x_train_orig=x_train_orig,
            y_train=y_train,
            x_orig=x_orig,
            model_id=model_id,
            force_refit=force_refit,
            training_size=training_size,
            format_output=format_output,
            prepare_kwargs={"shap_type": shap_type},
            produce_kwargs={
                "llm_model": llm_model,
                "detail_level": detail_level,
                "max_tokens": max_tokens,
                "temperature": temperature,
                "num_features": num_features,
                "context_description": context_description,
            },
            narrative=True,
        )
        return exp
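
    # Narrative sketch (requires an OpenAI API key; the output text is illustrative):
    #
    #   app = RealApp(model, X_train_orig=X, y_train=y, openai_api_key="...",
    #                 context_description="The model predicts the price of houses.")
    #   narratives = app.produce_narrative_feature_contributions(X.iloc[:2])
    #   # -> {row_id: "The price is driven up mainly by ...", ...}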

    def prepare_feature_importance(
        self,
        x_train_orig=None,
        y_train=None,
        model_id=None,
        algorithm=None,
        shap_type=None,
        training_size=None,
    ):
        """
        Initialize and fit a global feature importance explainer

        Args:
            model_id (int or string):
                Model id to explain
            x_train_orig (DataFrame of shape (n_instances, n_features)):
                Training data, if not provided at initialization.
            y_train (DataFrame or Series):
                Training targets, if not provided at initialization
            algorithm (string):
                GFI algorithm to use
            shap_type (string):
                If algorithm is "shap", type of shap to use
            training_size (int):
                Number of rows to use in fitting explainer

        Returns:
            A fit GlobalFeatureImportance explainer
        """
        if algorithm is None:
            algorithm = "shap"
        if model_id is None:
            model_id = self.active_model_id

        explainer = GlobalFeatureImportance(
            self.models[model_id],
            transformers=self.transformers,
            feature_descriptions=self.feature_descriptions,
            e_algorithm=algorithm,
            classes=self.classes,
            class_descriptions=self.class_descriptions,
            shap_type=shap_type,
            training_size=training_size,
        )
        explainer.fit(self._get_x_train_orig(x_train_orig), self._get_y_train(y_train))
        self._add_explainer("gfi", algorithm, explainer)
        return explainer

    def produce_feature_importance(
        self,
        model_id=None,
        x_train_orig=None,
        y_train=None,
        algorithm=None,
        format_output=True,
        shap_type=None,
        force_refit=False,
        training_size=None,
        num_features=None,
        select_by="absolute",
    ):
        """
        Produce a global feature importance explanation

        Args:
            model_id (string or int):
                ID of model to explain
            x_train_orig (DataFrame):
                Data to fit on, if not provided during initialization
            y_train (DataFrame or Series):
                Training targets to fit on, if not provided during initialization
            algorithm (string):
                Name of algorithm
            format_output (Boolean):
                If False, return output as a single DataFrame. Formatted outputs are
                more usable but take longer to generate.
            shap_type (string):
                If algorithm="shap", type of SHAP explainer to use
            force_refit (Boolean):
                If True, initialize and fit a new explainer even if the appropriate
                explainer already exists
            training_size (int):
                Number of rows to use in fitting explainer
            num_features (int):
                Number of features to include in the explanation. If None, include all
                features.
            select_by (one of "absolute", "min", "max"):
                If `num_features` is not None, method to use for selecting which
                features to show. Not used if num_features is None.

        Returns:
            DataFrame with a Feature Name column and an Importance column
        """
        if algorithm is None:
            algorithm = "shap"
        exp = self._produce_explanation_helper(
            "gfi",
            algorithm,
            self.prepare_feature_importance,
            format_feature_importance_output,
            model_id=model_id,
            x_train_orig=x_train_orig,
            y_train=y_train,
            format_output=format_output,
            force_refit=force_refit,
            training_size=training_size,
            prepare_kwargs={"shap_type": shap_type},
        )
        if num_features is not None:
            return get_top_contributors(exp, num_features=num_features, select_by=select_by)
        else:
            return exp
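
    # Importance sketch (assuming `app` as above):
    #
    #   importance = app.produce_feature_importance(num_features=10)
    #   # -> DataFrame with Feature Name and Importance columns, top 10 features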

    def prepare_similar_examples(
        self,
        x_train_orig=None,
        y_train=None,
        model_id=None,
        algorithm=None,
        training_size=None,
        standardize=False,
        fast=True,
    ):
        """
        Initialize and fit a nearest neighbor explainer

        Args:
            model_id (int or string):
                Model id to explain
            x_train_orig (DataFrame of shape (n_instances, n_features)):
                Training data, if not provided at initialization.
            y_train (DataFrame or Series):
                Training targets, if not provided at initialization
            algorithm (string):
                NN algorithm to use (current options: [nn])
            training_size (int):
                Number of rows to use in fitting explainer
            standardize (Boolean):
                If True, standardize data before using it to get similar examples.
                Recommended if model-ready data is not already standardized.
            fast (Boolean):
                If True, use a faster algorithm for getting similar examples (disable
                if faiss dependency not available)

        Returns:
            A fit SimilarExamples explainer
        """
        if algorithm is None:
            algorithm = "nn"
        if model_id is None:
            model_id = self.active_model_id

        explainer = SimilarExamples(
            self.models[model_id],
            transformers=self.transformers,
            feature_descriptions=self.feature_descriptions,
            e_algorithm=algorithm,
            classes=self.classes,
            class_descriptions=self.class_descriptions,
            training_size=training_size,
            standardize=standardize,
            fast=fast,
        )
        explainer.fit(self._get_x_train_orig(x_train_orig), self._get_y_train(y_train))
        self._add_explainer("se", algorithm, explainer)
        return explainer

    def produce_similar_examples(
        self,
        x_orig,
        model_id=None,
        x_train_orig=None,
        y_train=None,
        format_output=True,
        num_examples=3,
        standardize=False,
        fast=True,
        format_y=True,
        algorithm=None,
        force_refit=False,
    ):
        """
        Produce a similar examples explanation

        Args:
            x_orig (DataFrame):
                Input to explain
            model_id (string or int):
                ID of model to explain
            x_train_orig (DataFrame):
                Data to fit on, if not provided during initialization
            y_train (DataFrame or Series):
                Training targets to fit on, if not provided during initialization
            format_output (Boolean):
                No functionality, included for consistency
            num_examples (int):
                Number of similar examples to return
            standardize (Boolean):
                If True, standardize data before using it to get similar examples.
                Recommended if model-ready data is not already standardized.
            fast (Boolean):
                If True, use a faster algorithm for generating similar examples.
                Disable if faiss is not available.
            format_y (Boolean):
                If True, format the ground truth y values returned using
                self.pred_format_func
            algorithm (string):
                Name of algorithm
            force_refit (Boolean):
                If True, initialize and fit a new explainer even if the appropriate
                explainer already exists

        Returns:
            {"X": DataFrame, "y": Series, "Input": Series} (if series),
                    else {"id" -> {"X": DataFrame, "y": Series, "Input": Series}}
                X is the examples, ordered from top to bottom by similarity to input
                and y is the corresponding y values
                Input is the original input in the same feature space
        """
        if algorithm is None:
            algorithm = "nn"
        format_kwargs = dict()
        if format_y:
            format_kwargs["y_format_func"] = self.pred_format_func
        return self._produce_explanation_helper(
            "se",
            algorithm,
            self.prepare_similar_examples,
            format_similar_examples_output,
            x_orig=x_orig,
            model_id=model_id,
            x_train_orig=x_train_orig,
            y_train=y_train,
            format_output=format_output,
            force_refit=force_refit,
            prepare_kwargs={"standardize": standardize, "fast": fast},
            produce_kwargs={"num_examples": num_examples},
            format_kwargs=format_kwargs,
        )
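
    # Similar-examples sketch (assuming `app` and `X` as above):
    #
    #   similar = app.produce_similar_examples(X.iloc[0], num_examples=3)
    #   similar["X"]      # the 3 most similar training rows
    #   similar["y"]      # their ground-truth targets
    #   similar["Input"]  # the original input, in the same feature space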

    def train_feature_contribution_llm(
        self, x_train_orig=None, live=True, provide_examples=False, num_inputs=5, num_features=3
    ):
        """
        Run the training process for the LLM model used to generate narrative feature
        contribution explanations.

        Args:
            x_train_orig (DataFrame of shape (n_instances, n_features)):
                Training set to take sample inputs from. If None, the training set must
                be provided to the explainer at initialization.
            live (Boolean):
                If True, run the training process through CLI input/outputs. If False,
                this function will generate a shell training file that will need to be
                filled out and added to the RealApp manually. Currently only live
                training is supported.
            provide_examples (Boolean):
                If True, generate a base example of explanations at each step. This may
                make the process faster, but will incur costs to your OpenAI API
                account.
            num_inputs (int):
                Number of inputs to request.
            num_features (int):
                Number of features to include per explanation. If None, all features
                will be included.

        Returns:
            list of (explanation, narrative) pairs
                The generated training data
        """
        lfc_explainers = self._get_explainer("lfc")
        if not lfc_explainers:
            self.prepare_feature_contributions(x_train_orig=x_train_orig, algorithm="shap")
            lfc_explainers = self._get_explainer("lfc")
        training_data = None
        for i, algorithm in enumerate(lfc_explainers):
            if i == 0:
                training_data = lfc_explainers[algorithm].train_llm(
                    x_train=self._get_x_train_orig(x_train_orig),
                    live=live,
                    provide_examples=provide_examples,
                    num_inputs=num_inputs,
                    num_features=num_features,
                )
            else:
                lfc_explainers[algorithm].set_llm_training_data(training_data=training_data)
        return training_data

    def set_openai_client(self, openai_client=None, openai_api_key=None):
        """
        Set the openai client for this RealApp. One of openai_client or openai_api_key
        must be provided.

        Args:
            openai_client (openai.Client):
                OpenAI client object, with API key already set. If provided,
                openai_api_key is ignored.
            openai_api_key (string):
                OpenAI API key. If provided, create a new API client.
        """
        if openai_client is not None:
            self.openai_client = openai_client
        elif openai_api_key is not None:
            self.openai_client = OpenAI(api_key=openai_api_key)
        else:
            raise ValueError("Must provide openai_client or openai_api_key")

    @staticmethod
    def from_sklearn(
        pipeline=None,
        model=None,
        transformers=None,
        X_train=None,
        y_train=None,
        refit_model=True,
        verbose=0,
        **kwargs,
    ):
        """
        Create a RealApp from a sklearn pipeline or model and transformers.
        Must provide one of:
            - just pipeline
            - just model
            - model and transformers

        Args:
            pipeline (sklearn.pipeline.Pipeline):
                Sklearn pipeline to convert. The final step of the pipeline must be a
                model.
            model (sklearn model):
                Sklearn model to use. Ignored if pipeline is not None.
            transformers (list of sklearn transformers):
                List of sklearn transformers to use. Ignored if pipeline is not None.
            X_train (DataFrame):
                Training data to fit transformers and explanations to. May be required
                if transformers are not fitted or must be recreated. If not provided,
                must be provided when preparing and using realapp explainers.
            y_train (DataFrame or Series):
                Training targets to fit transformers and explanations to. If not
                provided, must be provided when preparing and using realapp explainers.
            refit_model (bool):
                If True, refit the model using the new Pyreal transformers. This may be
                necessary as sklearn and Pyreal transformers may result in an unaligned
                column order. Requires X_train and y_train to be provided.
            verbose (int):
                Verbosity level. If 0, no output. If 1, detailed output.
            **kwargs:
                Additional arguments to pass to the RealApp constructor

        Returns:
            RealApp
                Newly created RealApp object
        """
        if pipeline is None and model is None and transformers is None:
            raise ValueError("Must provide either pipeline or model")
        if pipeline is not None and not hasattr(pipeline, "steps"):
            raise ValueError("pipeline must be a valid sklearn pipeline")
        pyreal_transformers = []
        if pipeline is not None:
            model = pipeline.steps[-1][1]
            pyreal_transformers = sklearn_pipeline_to_pyreal_transformers(
                pipeline, X_train, verbose=verbose
            )
        elif transformers is not None:
            pyreal_transformers = sklearn_pipeline_to_pyreal_transformers(
                transformers, X_train, verbose=verbose
            )
        if refit_model:
            if X_train is None or y_train is None:
                raise ValueError("X_train and y_train must be provided to refit the model")
            model.fit(run_transformers(pyreal_transformers, X_train), y_train)

        return RealApp(
            models=model,
            transformers=pyreal_transformers,
            X_train_orig=X_train,
            y_train=y_train,
            **kwargs,
        )
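
    # from_sklearn sketch (a minimal, hypothetical pipeline; `X` and `y` are
    # assumed training data defined elsewhere):
    #
    #   from sklearn.pipeline import make_pipeline
    #   from sklearn.preprocessing import StandardScaler
    #   from sklearn.linear_model import LogisticRegression
    #
    #   pipe = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y)
    #   app = RealApp.from_sklearn(pipeline=pipe, X_train=X, y_train=y)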

    def _get_x_train_orig(self, x_train_orig):
        """
        Helper function to get the appropriate x_train_orig or raise errors if
        something goes wrong

        Args:
            x_train_orig (DataFrame or None):
                Provided DataFrame
        Returns:
            The DataFrame to use (x_train_orig or self.X_train_orig), may be None if
            neither is given
        """
        if x_train_orig is not None:
            if self.id_column is not None and self.id_column in x_train_orig:
                return x_train_orig.drop(columns=self.id_column)
            return x_train_orig
        else:
            return self.X_train_orig

    def _get_y_train(self, y_train):
        """
        Helper function to get the appropriate y_train or raise errors if something
        goes wrong

        Args:
            y_train (DataFrame or None):
                Provided DataFrame
        Returns:
            The DataFrame to use (y_train or self.y_train), may be None if neither is
            given
        """
        if y_train is not None:
            return y_train
        else:
            return self.y_train