# Source code for qmla.exploration_strategies.genetic_algorithms.genetic_exploration_strategy

import numpy as np
import itertools
import sys
import os
import random
import copy
import scipy
import time

import pandas as pd
import sklearn as skl


import matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns

try:
    from lfig import LatexFigure
except:
    from qmla.shared_functionality.latex_figure import LatexFigure
from qmla.exploration_strategies import exploration_strategy
import qmla.shared_functionality.probe_set_generation
import qmla.model_building_utilities

import qmla.shared_functionality.genetic_algorithm

__all__ = [
    "Genetic",
    "GeneticTest",
    "GeneticAlgorithmQMLAFullyConnectedLikewisePauliTerms",
]


def hamming_distance(str1, str2):
    r"""
    Count the positions at which two sequences hold different elements.

    Elements are compared pairwise in order; the comparison stops at the end
    of the shorter sequence, so surplus elements of the longer one are ignored.

    :param str1: first sequence, e.g. a chromosome string
    :param str2: second sequence
    :return int: number of compared positions whose elements differ
    """
    mismatches = 0
    for left, right in zip(str1, str2):
        if left != right:
            mismatches += 1
    return mismatches


[docs]class Genetic(exploration_strategy.ExplorationStrategy):
    r"""
    Exploration Strategy where the model search is mediated through a genetic algorithm.
    Genetic algorithm is implemented through :class:`qmla.GeneticAlgorithmQMLA`.
    This forms the base class for genetic algorithm applications within QMLA.

    :param str exploration_rules: name of exploration strategy used
    :param list genes: terms which are permitted in the model search,
        which become genes in the chromosomes of the genetic algorithm
    :param str true_model: name of the target model.

    """

    def __init__(self, exploration_rules, genes, true_model, **kwargs):
        r"""
        Configure the exploration strategy and build the underlying genetic algorithm.

        :param str exploration_rules: name of exploration strategy used,
            forwarded to the parent class constructor
        :param list genes: terms permitted in the model search; passed to the
            genetic algorithm as its genes
        :param str true_model: name of the target model
        :param kwargs: forwarded both to the parent class constructor and to
            the genetic algorithm constructor
        """
        super().__init__(exploration_rules=exploration_rules, **kwargs)

        self.genes = genes
        self.true_model = true_model
        # NOTE(review): rating_system is not among the imports visible at the top of
        # this file; presumably it is loaded by the qmla package itself -- confirm.
        self.ratings_class = qmla.shared_functionality.rating_system.ModifiedEloRating(
            initial_rating=1000, k_const=30
        )  # for use when ranking/rating models

        # NOTE(review): the attribute name is misspelled ("stratgey"); presumably it is
        # read elsewhere under this exact spelling -- confirm before renaming.
        self.branch_champion_selection_stratgey = "fitness"  # 'ratings'
        # Key of the fitness measure (see fitness_mechanism_names below) which
        # analyse_generation hands to the genetic algorithm.
        self.fitness_method = "elo_rating"
        self.prune_completed_initially = True
        self.prune_complete = True
        # One row per model per generation, holding every fitness measure.
        self.fitness_by_f_score = pd.DataFrame()
        # One row per (model, fitness measure) per generation.
        self.fitness_df = pd.DataFrame()
        self.num_sites = qmla.model_building_utilities.get_num_qubits(self.true_model)
        self.num_probes = 50
        self.max_num_qubits = 7
        # When True, exploration_strategy_finalise runs one extra genetic algorithm
        # step and logs the models it proposes.
        self.hypothetical_final_generation = False

        self.qhl_models = [
            "pauliSet_1J2_zJz_d3+pauliSet_1J3_yJy_d3+pauliSet_1J3_zJz_d3+pauliSet_2J3_xJx_d3+pauliSet_2J3_zJz_d3",
            "pauliSet_1J3_yJy_d3+pauliSet_1J3_zJz_d3+pauliSet_2J3_xJx_d3+pauliSet_2J3_zJz_d3",
            "pauliSet_1J2_zJz_d3+pauliSet_1J3_zJz_d3+pauliSet_2J3_xJx_d3+pauliSet_2J3_zJz_d3",
        ]
        self.spawn_step = 0  # 1st generation's ID

        self.mutation_probability = 0.1

        # Make sure the genetic algorithm receives a log file even when the caller
        # did not pass one explicitly.
        # (self.log_file is presumably set by the parent constructor -- confirm.)
        if "log_file" not in kwargs:
            kwargs["log_file"] = self.log_file

        self.genetic_algorithm = (
            qmla.shared_functionality.genetic_algorithm.GeneticAlgorithmQMLA(
                genes=genes,
                num_sites=self.num_sites,
                true_model=self.true_model,
                mutation_probability=self.mutation_probability,
                **kwargs,
            )
        )

        self.true_chromosome = self.genetic_algorithm.true_chromosome
        self.true_chromosome_string = self.genetic_algorithm.true_chromosome_string

        # Every gene is either present or absent, hence 2^(chromosome length) models.
        self.num_possible_models = 2 ** len(self.true_chromosome)

        self.max_num_probe_qubits = self.num_sites

        # default search size: at most 24 generations (max_spawn_depth),
        # 16 models in the first generation
        self.max_spawn_depth = 24
        self.initial_num_models = 16
        self.initial_models = self.genetic_algorithm.random_initial_models(
            num_models=self.initial_num_models
        )
        # Records filled by analyse_generation; keyed by model ID (model_f_scores)
        # or by spawn step (the remaining dictionaries).
        self.model_f_scores = {}
        self.model_points_at_step = {}
        self.generation_model_rankings = {}
        self.models_ranked_by_fitness = {}
        self.model_fitness_by_generation = {}
        self.fitness_correlations = {}

        self.tree_completed_initially = False
        # With the defaults above this evaluates to (16 * 24) / 10 = 38.4, a float.
        self.max_num_models_by_shape = {
            self.num_sites: (self.initial_num_models * self.max_spawn_depth) / 10,
            "other": 0,
        }
        self.num_processes_to_parallelise_over = self.initial_num_models

        self.max_time_to_consider = 15
        self.min_param = 0.35
        self.max_param = 0.65

        # LaTeX labels for each fitness measure recorded by analyse_generation;
        # used for the fitness_type_name column and for plot/storage labels.
        self.fitness_mechanism_names = {
            "f_score": r"$F_1$",
            "hamming_distance": r"$H$",
            "inverse_ll": r"$g^L$",
            "inverse_ll_sq": r"$-\frac{1}{L^2}$",
            "akaike_info_criterion": r"$\frac{1}{AIC}$",
            "aic_sq": r"$\frac{1}{AIC^2}$",
            "aicc": r"$\frac{1}{AICc}$",
            "aicc_sq": r"$g^{A}$",
            "bayesian_info_criterion": r"$\frac{1}{BIC}$",
            "bic_sq": r"$g^{B}$",
            "akaike_weight": r"$w_{A}$",
            "bayes_weight": r"$w_{B}$",
            "mean_residuals": r"$r_{\mu}$",
            "mean_residuals_sq": r"$r_{\mu}^2$",
            "rs_mean": r"$1-\overline{r}$",
            "rs_median": r"$1-\tilde{r}$",
            "rs_mean_sq": r"$g^{r}$",  # r"$(1-\overline{r})^2$",
            "rs_median_sq": r"$(1-\tilde{r})^2$",
            "bf_points": r"$g^{p}$",
            "bf_rank": r"$g^{R}$",
            "elo_rating": r"$g^{E}$",
        }

[docs]    def nominate_champions(self):
        r"""Choose model with highest fitness on final generation"""

        self.champion_model = self.models_ranked_by_fitness[self.spawn_step][0]
        self.log_print(
            [
                "Final generation:",
                self.spawn_step,
                "\nModel rankings on final generation:",
                self.models_ranked_by_fitness[self.spawn_step],
                "\nChampion:",
                self.champion_model,
            ]
        )

        return [self.champion_model]

[docs]    def analyse_generation(self, model_points, model_names_ids, **kwargs):
        r"""
        Following a complete generation of the genetic algorithm,
        perform all necessary processing to enable construction of next set of models.

        :param dict model_points: the number of Bayes factor comparisons for which each candidate
            within the generation was deemed superior against a contemporary model
        :param dict model_names_ids: mapping between models' names and their IDs from the QMLA environment;
            this enables analaysing further data passed from QMLA within kwargs.
        """

        self.spawn_step += 1

        self.log_print(["Analysing generation at spawn step ", self.spawn_step])
        self.log_print(["model names ids:", model_names_ids])
        self.model_points_at_step[self.spawn_step] = model_points

        # model_names_ids = model_names_ids
        sum_wins = sum(list(model_points.values()))
        if sum_wins == 0:
            sum_wins = 1  # TODO hack to get over some times passing empty dict from update_branch -- find a better way
        model_ids = list(model_points.keys())

        # model rankings  by number of wins
        ranked_model_list = sorted(model_points, key=model_points.get, reverse=True)
        ranked_models_by_name = [model_names_ids[m] for m in ranked_model_list]
        self.log_print(
            [
                "Ranked models:",
                ranked_model_list,
                "\n Names:",
                ranked_models_by_name,
                "\n with fitnesses:",
            ]
        )

        self.generation_model_rankings[self.spawn_step] = ranked_models_by_name
        rankings = list(range(1, len(ranked_model_list) + 1))
        rankings.reverse()
        num_points = sum(rankings)  # number of points to distribute
        ranking_points = list(
            zip(ranked_models_by_name, [r / num_points for r in rankings])
        )
        ranking_points = dict(ranking_points)

        # Model ratings  (Elo ratings)
        precomputed_ratings = self.ratings_class.get_ratings(list(model_points.keys()))
        original_ratings_by_name = {
            model_names_ids[m]: precomputed_ratings[m] for m in model_ids
        }
        min_rating = min(original_ratings_by_name.values())
        ratings_by_name = {
            m: original_ratings_by_name[m] - min_rating
            for m in original_ratings_by_name
        }
        self.log_print(["Rating (as fraction of starting rating):\n", ratings_by_name])
        sum_ratings = np.sum(list(ratings_by_name.values()))
        model_elo_ratings = {
            m: ratings_by_name[m] / sum_ratings for m in ratings_by_name
        }

        # New dictionaries which can be used as fitnesses:
        model_f_scores = {"fitness_type": "f_score"}
        model_hamming_distances = {"fitness_type": "hamming_distance"}
        model_number_wins = {"fitness_type": "number_wins"}
        model_win_ratio = {"fitness_type": "win_ratio"}
        mean_residuals = {"fitness_type": "mean_residuals"}
        log_likelihoods = {"fitness_type": "log_likelihoods"}

        # Alter finished dicts also useable as fitness
        # log_likelihoods['fitness_type'] = 'log_likelihoods'
        model_elo_ratings["fitness_type"] = "elo_ratings"
        ranking_points["fitness_type"] = "ranking"

        model_instances = [self.tree.model_storage_instances[m] for m in model_ids]
        aic_values = {
            model.model_id: model.akaike_info_criterion for model in model_instances
        }
        aicc_values = {
            model.model_id: model.akaike_info_criterion_c for model in model_instances
        }
        min_aicc = min(aicc_values.values())
        self.log_print(
            ["At generation {}, AIC of models: {}".format(self.spawn_step, aic_values)]
        )

        # store info on each model for analysis
        for m in model_ids:
            # Access the model storage instance and retrieve some attributes from there
            model_storage_instance = self.tree.model_storage_instances[m]
            self.log_print(["Model storage instance:", model_storage_instance])
            mod = model_storage_instance.model_name
            model_number_wins[mod] = model_points[m]
            hamming_dist = self.hamming_distance_model_comparison(
                test_model=mod
            )  # for fitness use 1/H
            model_hamming_distances[mod] = (
                self.genetic_algorithm.num_terms - hamming_dist
            ) / self.genetic_algorithm.num_terms
            model_f_scores[mod] = np.round(
                self.f_score_model_comparison(test_model=mod), 2
            )  # TODO get from model instance
            self.model_f_scores[m] = model_f_scores[mod]
            model_win_ratio[mod] = model_number_wins[mod] / sum_wins

            # store scores for offline analysis
            this_model_fitnesses = {
                # When adding a new fitness fnc -- add a name in self.fitness_mechanism_names
                "model": mod,
                "model_id": m,
                "generation": self.spawn_step,
                # absolute metrics (not available in real experiments)
                "f_score": model_f_scores[mod],
                "hamming_distance": model_hamming_distances[mod],
                # from storage instance
                # 'eval_log_likelihood' : model_storage_instance.evaluation_log_likelihood,
                "inverse_ll": -1 / model_storage_instance.evaluation_log_likelihood,
                "inverse_ll_sq": (-1 / model_storage_instance.evaluation_log_likelihood)
                ** 2,
                "akaike_info_criterion": 1
                / model_storage_instance.akaike_info_criterion,
                "aicc": 1 / model_storage_instance.akaike_info_criterion_c,
                "aic_sq": (1 / model_storage_instance.akaike_info_criterion) ** 2,
                "aicc_sq": (1 / model_storage_instance.akaike_info_criterion_c) ** 2,
                "bayesian_info_criterion": (
                    1 / model_storage_instance.bayesian_info_criterion
                ),
                "bic_sq": (1 / model_storage_instance.bayesian_info_criterion) ** 2,
                "akaike_weight": np.e
                ** ((min_aicc - model_storage_instance.akaike_info_criterion_c) / 2),
                "bayes_weight": np.e
                ** (-1 * model_storage_instance.bayesian_info_criterion / 2),
                "mean_residuals": 1 - model_storage_instance.evaluation_mean_pr0_diff,
                "mean_residuals_sq": (
                    1 - model_storage_instance.evaluation_mean_pr0_diff
                )
                ** 2,
                "rs_mean": 1
                - model_storage_instance.evaluation_residual_squares["mean"],
                "rs_median": 1
                - model_storage_instance.evaluation_residual_squares["median"],
                "rs_mean_sq": (
                    1 - model_storage_instance.evaluation_residual_squares["mean"]
                )
                ** 2,
                "rs_median_sq": (
                    1 - model_storage_instance.evaluation_residual_squares["median"]
                )
                ** 2,
                # relative to other models in this branch
                "bf_points": model_win_ratio[mod],
                "bf_rank": ranking_points[mod],
                "elo_rating": model_elo_ratings[mod],
                # 'original_elo_rating' : original_ratings_by_name[mod],
            }

            self.fitness_by_f_score = self.fitness_by_f_score.append(
                pd.Series(this_model_fitnesses), ignore_index=True
            )

            recorded_fitness_types = list(
                this_model_fitnesses.keys()
                - [
                    "model",
                    "model_id",
                    "generation",
                    "hamming_distance",
                ]
            )
            for f in recorded_fitness_types:
                try:
                    new_entry = pd.Series(
                        {
                            "generation": this_model_fitnesses["generation"],
                            "f_score": this_model_fitnesses["f_score"],
                            "fitness": this_model_fitnesses[f],
                            "fitness_type": f,
                            "fitness_type_name": self.fitness_mechanism_names[f],
                            "active_fitness_method": self.fitness_method == f,
                        }
                    )
                    self.fitness_df = self.fitness_df.append(
                        new_entry, ignore_index=True
                    )
                except:
                    self.log_print(
                        [
                            "fitness name keys:",
                            list(self.fitness_mechanism_names.keys())
                            # "f={}; type name = {}".format(f, self.fitness_mechanism_names[f])
                        ]
                    )
                    raise

        # Extract fitness specified by user (exploration strategy's fitness_method attribute)
        # to use for generating models within genetic algorithm
        fitnesses = self.fitness_by_f_score[
            self.fitness_by_f_score.generation == self.spawn_step
        ][["model", self.fitness_method]]

        genetic_algorithm_fitnesses = dict(
            zip(fitnesses["model"], fitnesses[self.fitness_method])
        )

        self.log_print(
            [
                "fitness method:{} => Fitnesses={}".format(
                    self.fitness_method, genetic_algorithm_fitnesses
                )
            ]
        )
        self.models_ranked_by_fitness[self.spawn_step] = sorted(
            genetic_algorithm_fitnesses,
            key=genetic_algorithm_fitnesses.get,
            reverse=True,
        )
        self.model_fitness_by_generation[self.spawn_step] = genetic_algorithm_fitnesses

        self.genetic_algorithm.consolidate_generation(
            model_fitnesses=genetic_algorithm_fitnesses
        )

        # return genetic_algorithm_fitnesses
        return self.models_ranked_by_fitness[self.spawn_step]

[docs]    def generate_models(self, model_list, **kwargs):
        r"""
        Model generation using genetic algorithm.

        Follows rules of :meth:`~qmla.exploration_strategies.ExplorationStrategy.generate_models`.
        """

        # Analysis of the previous generation is called by the exploration strategy tree.
        genetic_algorithm_fitnesses = self.model_fitness_by_generation[self.spawn_step]

        self.log_print(
            [
                "Spawn step:",
                self.spawn_step,
            ]
        )

        # Spawn models from genetic algorithm
        new_models = self.genetic_algorithm.genetic_algorithm_step(
            model_fitnesses=genetic_algorithm_fitnesses,
            num_pairs_to_sample=self.initial_num_models
            / 2,  # for every pair, 2 chromosomes proposed
        )

        return new_models

    def finalise_model_learning(self, **kwargs):
        r"""
        Do nothing: any keyword arguments are accepted and ignored.

        :return: None
        """
        return None

[docs]    def hamming_distance_model_comparison(
        self,
        test_model,
        target_model=None,
    ):
        r"""
        Compare test_model with target_model by Hamming distance
        """

        if target_model is None:
            target_model = self.true_chromosome_string
        else:
            target_model = self.genetic_algorithm.chromosome_string(
                self.genetic_algorithm.map_model_to_chromosome(target_model)
            )
        test_model = self.genetic_algorithm.chromosome_string(
            self.genetic_algorithm.map_model_to_chromosome(test_model)
        )

        h = sum(c1 != c2 for c1, c2 in zip(test_model, target_model))
        return h

[docs]    def f_score_model_comparison(
        self,
        test_model,
        target_model=None,
        beta=1,
    ):
        r"""
        Get F score of candidate model, measure of overlap between the terms of the candidate and target model

        :param str test_model: name of candidate model
        :param str target_model: name of target model, if None, assumed that target is self.true_model
        :param float beta: relative importance of precision to sensitivity. in general this is F-beta score,
            usually beta = 1
        """
        if target_model is None:
            target_model = self.true_model

        true_set = set(
            self.latex_name(mod)
            for mod in qmla.model_building_utilities.get_constituent_names_from_name(
                target_model
            )
        )
        terms = [
            self.latex_name(term)
            for term in qmla.model_building_utilities.get_constituent_names_from_name(
                test_model
            )
        ]
        learned_set = set(sorted(terms))

        total_positives = len(true_set)
        true_positives = len(true_set.intersection(learned_set))
        false_positives = len(learned_set - true_set)
        false_negatives = len(true_set - learned_set)
        precision = true_positives / (true_positives + false_positives)
        sensitivity = true_positives / total_positives
        try:
            f_score = (1 + beta ** 2) * (
                (precision * sensitivity) / (beta ** 2 * precision + sensitivity)
            )
        except BaseException:
            # both precision and sensitivity=0 as true_positives=0
            f_score = 0
        return f_score

[docs]    def f_score_from_chromosome_string(
        self,
        chromosome,
    ):
        r"""
        F1 score between chromosome and true model
        """

        mod = np.array([int(a) for a in list(chromosome)])

        try:
            f = skl.metrics.f1_score(mod, self.true_chromosome)
            return f
        except:
            self.log_print(
                [
                    "F score from chromosome {} with mod {} not working against true chrom {}".format(
                        mod, chromosome, self.true_chromosome
                    )
                ]
            )
            raise

[docs]    def exploration_strategy_finalise(self):
        r"""
        Genetic algorithm specific version of :meth:`qmla.ExplorationStrategy.exploration_strategy_finalise`.
        """

        # hypothetical generation_models
        if self.hypothetical_final_generation:
            # TODO this will cause a crash in QHL mode since.
            # in general this should be turned off so not worth a large fix
            self.log_print(["Running hypothetical step to get some models"])
            hypothetical_models = self.genetic_algorithm.genetic_algorithm_step(
                model_fitnesses=self.model_fitness_by_generation[self.spawn_step - 1],
                num_pairs_to_sample=self.initial_num_models
                / 2,  # for every pair, 2 chromosomes proposed
            )
            self.log_print(["hypothetical generation models:", hypothetical_models])

        self.storage.fitness_correlations = self.fitness_correlations
        self.storage.fitness_by_f_score = self.fitness_by_f_score
        self.storage.fitness_df = self.fitness_df
        self.storage.true_model_chromosome = self.true_chromosome_string
        self.storage.ratings_df = self.ratings_class.ratings_df
        gene_pool = self.genetic_algorithm.gene_pool
        gene_pool["objective_function"] = self.fitness_mechanism_names[
            self.fitness_method
        ]
        self.storage.gene_pool = gene_pool
        birth_register = self.genetic_algorithm.birth_register
        birth_register["objective_function"] = self.fitness_mechanism_names[
            self.fitness_method
        ]
        birth_register["max_time_considered"] = self.max_time_to_consider
        self.storage.birth_register = birth_register
        self.storage.ratings = self.ratings_class.all_ratings

        chromosomes = sorted(
            list(set(self.genetic_algorithm.previously_considered_chromosomes))
        )
        self.unique_chromosomes = pd.DataFrame(
            columns=[
                "chromosome",
                "numeric_chromosome",
                "f_score",
                "num_terms",
                "hamming_distance",
            ]
        )
        for c in chromosomes:
            hamming_dist = self.hamming_distance_model_comparison(
                test_model=self.genetic_algorithm.map_chromosome_to_model(c)
            )  # for fitness use 1/H

            chrom_data = pd.Series(
                {
                    "chromosome": str(c),
                    "numeric_chromosome": int(c, 2),
                    "num_terms": self.genetic_algorithm.num_terms,
                    "hamming_distance": hamming_dist,
                    "f_score": np.round(self.f_score_from_chromosome_string(c), 3),
                }
            )
            self.unique_chromosomes.loc[len(self.unique_chromosomes)] = chrom_data
        self.log_print(["self.unique_chromosomes:\n", self.unique_chromosomes])
        self.storage.unique_chromosomes = self.unique_chromosomes

        dud_chromosome = str("1" + "0" * self.genetic_algorithm.num_terms)
        if dud_chromosome in chromosomes:
            self.log_print(
                [
                    "{} in previous chromosomes:\n{}".format(
                        dud_chromosome,
                        self.genetic_algorithm.previously_considered_chromosomes,
                    )
                ]
            )
        chromosome_numbers = sorted([int(c, 2) for c in chromosomes])
        # self.exploration_strategy_specific_data_to_store['chromosomes_tested'] = chromosome_numbers
        try:
            f_scores = []
            for c in chromosomes:
                try:
                    f_scores.append(np.round(self.f_score_from_chromosome_string(c), 3))
                except:
                    self.log_print(
                        ["Could not compute f score for chromosome: {}".format(c)]
                    )
            # self.exploration_strategy_specific_data_to_store['f_score_tested_models' ] = f_scores
        except:
            self.log_print(
                [
                    "Could not compute f score for chromosome list: {}".format(
                        chromosomes
                    )
                ]
            )
            pass

        self.storage.chromosomes_tested = chromosome_numbers
        self.storage.f_score_tested_models = f_scores

[docs]    def check_tree_completed(self, spawn_step, **kwargs):
        r"""
        Genetic algorithm specific version of :meth:`qmla.ExplorationStrategy.check_tree_completed`.
        """

        if self.spawn_step == self.max_spawn_depth:
            self.log_print(["Terminating at spawn depth ", self.spawn_step])
            return True
        elif self.genetic_algorithm.best_model_unchanged:
            self.champion_determined = True
            self.champion_model = (
                self.genetic_algorithm.most_elite_models_by_generation[
                    self.genetic_algorithm.genetic_generation - 1
                ]
            )

            self.log_print(
                [
                    "Terminating search early (after {} generations) b/c elite model unchanged in {} generations.".format(
                        self.genetic_algorithm.genetic_generation,
                        self.genetic_algorithm.unchanged_elite_num_generations_cutoff,
                    ),
                    "\nDeclaring champion:",
                    self.champion_model,
                ]
            )
            # check if elite model hasn't changed in last N generations
            return True
        else:
            self.log_print(["Elite models changed recently; continuing search."])
            return False

[docs]    def check_tree_pruned(self, **kwargs):
        r"""
        Genetic algorithm specific version of :meth:`qmla.ExplorationStrategy.check_tree_pruned`.
        """
        # no pruning for GA, winner is champion of final branch
        return True

[docs]    def set_specific_plots(self, **kwargs):
        r"""
        Genetic algorithm specific version of :meth:`qmla.ExplorationStrategy.set_specific_plots`.
        """

        self.plot_methods_by_level = {
            1: [],
            2: [
                self._plot_correlation_fitness_with_f_score,
                self._plot_fitness_v_fscore_by_generation,
                self.__plot_gene_pool_progression,
            ],
            3: [
                self._plot_fitness_v_fscore,
                self._plot_fitness_v_generation,
            ],
            4: [
                self._plot_model_ratings,
                self._plot_gene_pool,
            ],
            5: [self.plot_generational_metrics, self._plot_selection_probabilities],
            6: [],
        }

        # Plots that need arguments so are called individually
        if self.plot_level >= 2:
            try:
                self.ratings_class.plot_models_ratings_against_generation(
                    f_scores=self.model_f_scores,
                    save_directory=self.save_directory,
                    f_score_cmap=self.f_score_cmap,
                    figure_format=self.figure_format,
                )
            except Exception as e:
                self.log_print(
                    [
                        "plot failed plot_models_ratings_against_generation with error ",
                        e,
                    ]
                )

            try:
                self.ratings_class.plot_rating_progress_single_model(
                    target_model_id=champion_model_id,
                    save_to_file=os.path.join(
                        self.save_directory, "ratings_progress_champion.png"
                    ),
                )
                if true_model_id != -1 and true_model_id != champion_model_id:
                    self.ratings_class.plot_rating_progress_single_model(
                        target_model_id=true_model_id,
                        save_to_file=os.path.join(
                            save_directory, "ratings_progress_true_model.png"
                        ),
                    )
            except Exception as e:
                self.log_print(
                    ["plot failed plot_rating_progress_single_model with error ", e]
                )

[docs]    def _plot_correlation_fitness_with_f_score(
        self,
        save_to_file=None,
    ):
        r"""
        Show how the fitness of models at each generation progress in terms of F score.
        """

        plt.clf()
        correlations = pd.DataFrame(columns=["Generation", "Method", "Correlation"])
        fitness_types_to_ignore = ["f_score", "hamming_distance"]
        for t in self.fitness_df.fitness_type.unique():
            if t not in fitness_types_to_ignore:
                this_fitness_type = self.fitness_df[
                    self.fitness_df["fitness_type"] == t
                ]

                for g in this_fitness_type.generation.unique():
                    this_type_this_gen = this_fitness_type[
                        this_fitness_type.generation == g
                    ]

                    corr = this_type_this_gen["f_score"].corr(
                        this_type_this_gen["fitness"]
                    )
                    cov = this_type_this_gen["f_score"].cov(
                        this_type_this_gen["fitness"]
                    )

                    corr = {
                        "Generation": g,
                        "Method": self.fitness_mechanism_names[t],
                        # 'Method' : t,
                        "Correlation": corr,
                        "Covariance": cov,
                    }
                    correlations = correlations.append(
                        pd.Series(corr), ignore_index=True
                    )

        self.fitness_correlations = correlations
        self.log_print(["fitness correlations:\n", self.fitness_correlations])
        fig, ax = plt.subplots(figsize=(15, 10))

        if len(correlations.Generation.unique()) == 1:
            sns.scatterplot(
                y="Correlation",
                x="Generation",
                # style= 'Method',
                hue="Method",
                data=correlations,
                ax=ax,
                # markers = ['*', 'X', '<', '^'],
            )
        else:
            sns.lineplot(
                y="Correlation",
                x="Generation",
                # style= 'Method',
                hue="Method",
                data=correlations,
                ax=ax,
                markers=["*", "X", "<", "^"],
            )
        ax.axhline(0, ls="--", c="k")

        if save_to_file is None:
            save_to_file = os.path.join(
                self.save_directory,
                "correlations_bw_fitness_and_f_score.png".format(self.qmla_id),
            )

        plt.savefig(save_to_file)

[docs]    def _plot_fitness_v_generation(self, save_to_file=None):
        r"""
        Plot progression of fitness against generations of the genetic algorithm.
        """
        import matplotlib.pyplot as plt
        import seaborn as sns

        plt.clf()
        fig, ax = plt.subplots()
        sns.set(rc={"figure.figsize": (11.7, 8.27)})

        cmap = sns.cubehelix_palette(dark=0.3, light=0.8, as_cmap=True)
        sns.boxplot(
            x="generation",
            y="fitness",
            data=self.fitness_df[
                # self.fitness_df['fitness_type'] == 'model_hamming_distances'
                self.fitness_df["active_fitness_method"]
                == True
            ],
            ax=ax,
        )
        ax.legend(loc="lower right")
        ax.set_xlabel("Generation")
        ax.set_ylabel("Fitness")
        ax.set_title("Fitness method: {}".format(self.fitness_method))
        # ax.set_xlim((0,1))
        if save_to_file is None:
            save_to_file = os.path.join(
                self.save_directory,
                "fitness_v_generation.{}".format(self.figure_format),
            )

        plt.savefig(save_to_file)

[docs]    def _plot_fitness_v_fscore_by_generation(
        self,
    ):
        r"""
        Plot fitness vs f score throughout generations of the genetic algorithm.
        """

        plt.clf()
        sanity_check_df = self.fitness_df[
            (self.fitness_df["fitness_type"] == "f_score")
            | (self.fitness_df["fitness_type"] == "model_hamming_distances")
        ]
        candidate_fitnesses = self.fitness_df[
            (self.fitness_df["fitness_type"] == "elo_rating")
            | (self.fitness_df["fitness_type"] == "ranking")
            | (self.fitness_df["fitness_type"] == "model_win_ratio")
        ]

        g = sns.FacetGrid(
            candidate_fitnesses,
            row="generation",
            hue="fitness_type",
            hue_kws=dict(marker=["x", "+", "*"]),
            # col_wrap=5,
            xlim=(-0.1, 1.1),
            # ylim=(0,1),
            size=4,
            aspect=2,
        )
        g = g.map(plt.scatter, "f_score", "fitness").add_legend()

        save_to_file = os.path.join(
            self.save_directory, "fitness_types.{}".format(self.figure_format)
        )
        plt.savefig(save_to_file)

[docs]    def _plot_model_ratings(
        self,
    ):
        r"""
        Plot ratings of models on all generations, as determined by the RatingSystem
        """

        plt.clf()
        ratings = self.ratings_class.all_ratings
        generations = [int(g) for g in ratings.generation.unique()]
        num_generations = len(generations)

        lf = LatexFigure(use_gridspec=True, gridspec_layout=(num_generations, 1))

        # TODO : unique linestyle and colour combo for each model ID and tracks across subplots
        ratings["Model ID"] = ratings["model_id"]

        for gen in generations:
            ax = lf.new_axis()

            this_gen_ratings = ratings[ratings.generation == gen]
            colours = {
                m: self.f_score_cmap(self.model_f_scores[m])
                for m in this_gen_ratings["model_id"]
            }
            sns.lineplot(
                x="idx",
                y="rating",
                hue=r"Model ID",
                hue_order=sorted(this_gen_ratings.model_id.unique()),
                data=this_gen_ratings,
                ax=ax,
                legend="full",
                palette=colours,
            )

            ax.set_title("Generation {}".format(gen), pad=-15)
            ax.set_xlabel("")
            ax.set_ylabel("Elo rating")
            ax.legend(bbox_to_anchor=(1, 1))

        save_to_file = os.path.join(self.save_directory, "ratings".format(self.qmla_id))

        lf.save(save_to_file, file_format=self.figure_format)

[docs]    def _plot_fitness_v_fscore(self):
        r"""
        Plot fitness against f score
        """

        plt.clf()
        fig, ax = plt.subplots()
        sns.set(rc={"figure.figsize": (11.7, 8.27)})

        cmap = sns.cubehelix_palette(dark=0.3, light=0.8, as_cmap=True)
        sns.scatterplot(
            x="f_score",
            y="elo_rating",
            # hue='generation',
            # palette = cmap,
            label="Rating",
            data=self.fitness_by_f_score,
            ax=ax,
        )

        sns.scatterplot(
            x="f_score",
            y="win_ratio",
            # hue='generation',
            # palette = cmap,
            label="Win ratio",
            data=self.fitness_by_f_score,
            ax=ax,
        )

        ax.legend(loc="lower right")
        ax.set_xlabel("F score")
        ax.set_ylabel("Fitness (as probability)")
        # bplot.set_ylim((0,1))
        ax.set_xlim((-0.05, 1.05))
        save_to_file = os.path.join(
            self.save_directory, "fitness_v_fscore.png".format(self.qmla_id)
        )

        ax.figure.savefig(save_to_file)

[docs]    def _plot_gene_pool(self):
        r"""
        Show the F scores of all models in all generations
        """
        ga = self.genetic_algorithm

        plt.clf()
        fig, axes = plt.subplots(
            figsize=(10, 8),
            constrained_layout=True,
        )

        gs = GridSpec(nrows=2, ncols=1, height_ratios=[7, 1])
        label_fontsize = 10
        # TODO get f score cmap from exploration strategy
        # f_score_cmap = matplotlib.colors.ListedColormap(["sienna", "red", "darkorange", "gold", "blue"])
        f_score_cmap = self.f_score_cmap

        # Bar plots for probability of gene being selected, coloured by f score
        ax = fig.add_subplot(gs[0, 0])

        generations = list(sorted(ga.gene_pool.generation.unique()))
        probability_grouped_by_f_by_generation = {
            g: {
                f: ga.gene_pool[
                    (ga.gene_pool.f_score == f) & (ga.gene_pool.generation == g)
                ].probability.sum()
                for f in ga.gene_pool.f_score.unique()
            }
            for g in generations
        }
        probability_grouped_by_f_by_generation = pd.DataFrame(
            probability_grouped_by_f_by_generation
        ).T

        sorted_f_scores = list(sorted(ga.gene_pool.f_score.unique()))
        below = [0] * len(generations)
        for f in sorted_f_scores[:]:
            probs_this_f = list(probability_grouped_by_f_by_generation[f])
            ax.bar(
                generations,
                probs_this_f,
                color=f_score_cmap(f),
                bottom=below,
                edgecolor=["black"] * len(generations),
            )

            below = [b + p for b, p in zip(below, probs_this_f)]
        ax.set_xticks(generations)
        ax.set_ylabel("Probability", fontsize=label_fontsize)
        ax.set_xlabel("Generation", fontsize=label_fontsize)
        ax.set_title("Gene pool", fontsize=label_fontsize)

        # Colour bar
        ax = fig.add_subplot(gs[1, 0])
        sm = plt.cm.ScalarMappable(
            cmap=f_score_cmap, norm=plt.Normalize(vmin=0, vmax=1)
        )
        sm.set_array(np.linspace(0, 1, 100))
        plt.colorbar(sm, cax=ax, orientation="horizontal")
        ax.set_xlabel("F-score", fontsize=label_fontsize)

        # Save figure
        save_to_file = os.path.join(
            self.save_directory, "gene_pool.{}".format(self.figure_format)
        )

        fig.savefig(save_to_file)

[docs]    def _plot_selection_probabilities(self):
        r"""
        Plot pie charts of the selection probabilities of prospective parents at each generation.
        Models are signified by their F score.
        """
        generations = sorted(self.genetic_algorithm.gene_pool.generation.unique())
        self.log_print(["[_plot_selection_probabilities] generations:", generations])
        lf = LatexFigure(auto_gridspec=len(generations))

        for g in generations:
            ax = lf.new_axis()
            this_gen_genes = self.genetic_algorithm.gene_pool[
                self.genetic_algorithm.gene_pool.generation == g
            ]
            f_scores = this_gen_genes.f_score
            colours = [self.f_score_cmap(f) for f in f_scores]
            probabilities = this_gen_genes.probability

            ax.pie(
                probabilities,
                colors=colours,
                radius=2,
            )

        save_to_file = os.path.join(self.save_directory, "selection_probabilities")
        lf.save(save_to_file, figure_format=self.figure_format)

[docs]    def plot_generational_metrics(self):
        r"""
        Show various metrics across all generations
        """
        fig, axes = plt.subplots(figsize=(15, 10), constrained_layout=True)
        gs = GridSpec(
            nrows=2,
            ncols=1,
        )

        ax = fig.add_subplot(gs[0, 0])
        sns.boxplot(y="f_score", x="generation", data=self.fitness_by_f_score, ax=ax)
        ax.set_ylabel("F-score")
        ax.set_xlabel("Generation")
        ax.set_title("F score")
        ax.set_ylim(0, 1)
        ax.legend()

        ax = fig.add_subplot(gs[1, 0])
        sns.boxplot(
            y="log_likelihood", x="generation", data=self.fitness_by_f_score, ax=ax
        )
        ax.set_ylabel("log-likelihood")
        ax.set_xlabel("Generation")
        ax.set_title("Evaluation log likeihood")
        ax.legend()

        # Save figure
        save_to_file = os.path.join(self.save_directory, "generation_progress.png")

        fig.savefig(save_to_file)

    def __plot_gene_pool_progression(
        self,
    ):
        r"""
        Succinct representation of the progression of gene pool with respect to F score.

        Sorts the genetic algorithm's gene pool by descending ``f_score``,
        draws it with :meth:`gene_pool_progression` using ``self.f_score_cmap``
        and saves the figure as ``gene_pool_progression`` in
        ``self.save_directory`` with ``self.figure_format``.
        """
        lf = LatexFigure()
        ax = lf.new_axis()
        # Sort a copy: an in-place sort would reorder the gene pool owned by
        # the genetic algorithm as a side effect of plotting.
        gene_pool = self.genetic_algorithm.gene_pool.sort_values(
            "f_score", ascending=False
        )

        self.gene_pool_progression(
            gene_pool=gene_pool,
            ax=ax,
            f_score_cmap=self.f_score_cmap,
        )
        lf.save(
            save_to_file=os.path.join(self.save_directory, "gene_pool_progression"),
            file_format=self.figure_format,
        )

[docs]    @staticmethod
    def gene_pool_progression(
        gene_pool, ax, f_score_cmap=None, draw_cbar=True, cbar_ax=None
    ):
        r"""
        Method for plotting succinct summary of progression of gene pool with respect to F score.
        """
        if f_score_cmap is None:
            f_score_cmap = matplotlib.cm.RdBu
        num_models_per_generation = len(gene_pool[gene_pool.generation == 1])
        num_generations = gene_pool.generation.nunique()
        f_scores_of_gene_pool = np.empty((num_models_per_generation, num_generations))
        for g in gene_pool.generation.unique():

            f_scores_by_gen = gene_pool[gene_pool.generation == g].f_score

            f_scores_of_gene_pool[:, g - 1] = f_scores_by_gen

        sns.heatmap(
            f_scores_of_gene_pool,
            cmap=f_score_cmap,
            vmin=0,
            vmax=1,
            ax=ax,
            cbar=draw_cbar,
            cbar_kws=dict(
                label=r"$F_1$-score",
                aspect=25,
                ticks=[0, 0.5, 1],
            ),
        )
        ax.set_yticks([])
        xtick_pos = range(5, num_generations + 1, 5)
        ax.set_xticks([g - 0.5 for g in xtick_pos])
        ax.set_xticklabels(xtick_pos)
        ax.set_xlabel("Generation")

        if cbar_ax is not None:
            cbar = ax.collections[0].colorbar
            cbar.ax.set_ylabel(r"$F_1$", rotation=0, labelpad=10)  # if F horizontal
            cbar.ax.yaxis.set_label_position(
                "right",
            )
            cbar.ax.tick_params(labelleft=True, labelright=False)


class GeneticTest(Genetic):
    r"""
    Same behaviour as the genetic exploration strategy, configured with a
    small spawn depth and few initial models so that it runs quickly in tests.

    The target model is fixed to a four-qubit set of ``zJz`` couplings, and the
    available genes are the ``xJx``, ``yJy`` and ``zJz`` couplings between
    every pair of sites.
    """

    def __init__(self, exploration_rules, **kwargs):
        target_model = "pauliSet_1J2_zJz_d4+pauliSet_1J3_zJz_d4+pauliSet_2J3_zJz_d4+pauliSet_2J4_zJz_d4+pauliSet_3J4_zJz_d4"
        self.true_model = qmla.model_building_utilities.alph(target_model)
        num_sites = qmla.model_building_utilities.get_num_qubits(target_model)

        # Every site pair (i < j), each with the three like-Pauli couplings.
        site_pairs = itertools.combinations(range(1, 1 + num_sites), 2)
        terms = [
            "pauliSet_{i}J{j}_{o}J{o}_d{N}".format(i=i, j=j, o=t, N=num_sites)
            for i, j in site_pairs
            for t in ["x", "y", "z"]
        ]

        super().__init__(
            exploration_rules=exploration_rules,
            genes=terms,
            true_model=self.true_model,
            **kwargs,
        )

        # Overrides applied on top of the base class' configuration.
        self.max_spawn_depth = 2
        self.max_num_probe_qubits = self.num_sites
        self.initial_num_models = 6
        self.initial_models = self.genetic_algorithm.random_initial_models(
            num_models=self.initial_num_models
        )
        self.tree_completed_initially = False
        models_allowed = (self.initial_num_models * self.max_spawn_depth) / 10
        self.max_num_models_by_shape = {
            self.num_sites: models_allowed,
            "other": 0,
        }
        self.num_processes_to_parallelise_over = self.initial_num_models


class GeneticAlgorithmQMLAFullyConnectedLikewisePauliTerms(Genetic):
    r"""
    Exact structure of :class:`~qmla.Genetic`, where the available terms
    are assumed to follow conventional pauliSet format, and all sites are connected.
    e.g. terms of the form:
    pauliSet_1J2_xJx_d2, pauliSet_1J2_yJy_d2, pauliSet_1J2_zJz_d2,

    :param str exploration_rules: name of exploration strategy used
    :param str true_model: name of the target model.
    :param int num_sites: number of sites to connect; when ``None`` it is
        taken from ``true_model`` via
        ``qmla.model_building_utilities.get_num_qubits``.
    :param base_terms: single-site operator labels; one gene
        ``pauliSet_{i}J{j}_{o}J{o}_d{N}`` is generated for every site pair
        ``i < j`` and every label ``o``.
    """

    def __init__(
        self,
        exploration_rules,
        true_model,
        num_sites=None,
        # Immutable default: a list here would be one object shared by every call.
        base_terms=("x", "y", "z"),
        **kwargs
    ):
        if num_sites is None:
            num_sites = qmla.model_building_utilities.get_num_qubits(true_model)

        terms = [
            "pauliSet_{i}J{j}_{o}J{o}_d{N}".format(i=i, j=j, o=t, N=num_sites)
            for i in range(1, 1 + num_sites)
            for j in range(i + 1, 1 + num_sites)
            for t in base_terms
        ]

        super().__init__(
            exploration_rules=exploration_rules,
            genes=terms,
            true_model=true_model,
            **kwargs,
        )
Source code for qmla.exploration_strategies.genetic_algorithms.genetic_exploration_strategy

Quantum Model Learning Agent

Navigation

Related Topics