Source code for amorphgen.analysis.analyser

"""
StructureAnalyser — main class that delegates to submodules.

Usage
-----
    from amorphgen.analysis import StructureAnalyser

    sa = StructureAnalyser("output_dir/", cutoff="auto")
    sa.summary()
    sa.plot(output_dir="plots/")
"""

from __future__ import annotations

import os
import glob
import numpy as np
from collections import defaultdict
from ase.io import read

from .cutoff import auto_cutoff_minsep, auto_cutoff_rdf
from .structure import (compute_density, compute_coordination,
                        compute_bond_distances, compute_all_angles,
                        compute_bond_angle_stats, build_neighbour_dict)
from .rdf import (compute_rdf, compute_structure_factor,
                  compute_averaged_rdf)
from .rings import compute_ring_statistics
from .voronoi import compute_voronoi
from .energy import compute_energy_ranking
from .plotting import plot_analysis



[docs]
class StructureAnalyser:
    """
    Analyse amorphous structures: density, coordination, distances,
    angles, RDF, S(q), rings, Voronoi, energy ranking.

    Parameters
    ----------
    source : str or list of str
        Path to a structure file, a directory, or a list of file paths.
    cutoff : float, dict, or str
        - float: single cutoff (A) for all pairs
        - dict: pair-specific, e.g. {"Si-O": 2.2}
        - "auto": from bonding radii (fast)
        - "auto-rdf": from RDF first minimum (accurate)
    """


[docs]
    def __init__(self, source, cutoff="auto"):
        """
        Parameters
        ----------
        source : str, list of str, or list of Atoms
            Path to a structure file, a directory of structure files,
            a list of file paths, or a list of ASE Atoms objects.
        cutoff : float, dict, or str
            Neighbour cutoff for CN, bond distances, and angles.
            - float: single cutoff (A) for all pairs
            - dict: pair-specific, e.g. {"Si-O": 2.2, "O-O": 2.8}
            - "auto": from bonding radii (fast, default)
            - "auto-rdf": from RDF first minimum (more accurate)
        """
        self.atoms_list = self._load(source)
        if not self.atoms_list:
            raise ValueError("No structures loaded. Check the source path.")

        # Warn if structures have different compositions
        formulas = set(a.get_chemical_formula(mode="hill")
                       for a in self.atoms_list)
        if len(formulas) > 1:
            import warnings
            warnings.warn(
                f"Structures have different compositions: {formulas}. "
                f"Analysis assumes uniform composition; energy/atom and "
                f"CN statistics may mix incompatible systems."
            )

        self._cutoff_mode = cutoff

        if cutoff == "auto":
            self.cutoff = auto_cutoff_minsep(self.atoms_list)
            self._cutoff_mode = "auto (minsep)"
        elif cutoff == "auto-rdf":
            self.cutoff = auto_cutoff_rdf(self.atoms_list)
            self._cutoff_mode = "auto (RDF)"
        else:
            self.cutoff = cutoff

        self._max_cutoff = (
            max(self.cutoff.values()) if isinstance(self.cutoff, dict)
            else self.cutoff
        )


    @staticmethod
    def _load(source):
        from ase import Atoms
        if isinstance(source, list):
            if source and isinstance(source[0], Atoms):
                return source
            return [read(f) for f in source]
        if os.path.isdir(source):
            files = sorted(
                glob.glob(os.path.join(source, "*.xyz"))
                + glob.glob(os.path.join(source, "*.extxyz"))
                + glob.glob(os.path.join(source, "*.cif"))
                + glob.glob(os.path.join(source, "*.vasp"))
            )
            if not files:
                raise FileNotFoundError(f"No structure files in {source}/")
            return [read(f) for f in files]
        return [read(source)]

    def _get_cutoff(self, s1, s2):
        if isinstance(self.cutoff, (int, float)):
            return self.cutoff
        key1 = f"{s1}-{s2}"
        key2 = f"{s2}-{s1}"
        return self.cutoff.get(key1, self.cutoff.get(key2, self._max_cutoff))

    def _build_neighbour_dict(self, atoms):
        return build_neighbour_dict(atoms, self._max_cutoff, self._get_cutoff)

    # ── Core analysis methods ────────────────────────────────────────────


[docs]
    def density(self):
        """Compute density (g/cm3) for each structure.

        Returns
        -------
        dict
            {"values": list[float], "mean": float, "std": float}
        """
        return compute_density(self.atoms_list)



[docs]
    def coordination(self, pair=None):
        """Compute coordination numbers across all structures.

        Parameters
        ----------
        pair : str, optional
            Pair to analyse, e.g. "Si-O". If None, all pairs computed.

        Returns
        -------
        dict
            Keyed by pair string. Each value is a dict with
            "mean", "std", "min", "max", "distribution", "total_atoms".
        """
        return compute_coordination(self.atoms_list, self._max_cutoff,
                                    self._get_cutoff, pair)



[docs]
    def bond_distances(self, pair=None):
        """Compute bond distance statistics.

        Parameters
        ----------
        pair : str, optional
            Pair to analyse, e.g. "Si-O". If None, all pairs computed.

        Returns
        -------
        dict
            Keyed by pair string. Each value is a dict with
            "mean", "std", "min", "max", "count".
        """
        return compute_bond_distances(self.atoms_list, self._max_cutoff,
                                      self._get_cutoff, pair)


    def _compute_all_angles(self, triplet=None, bonding_only=True):
        return compute_all_angles(self.atoms_list, self._max_cutoff,
                                  self._get_cutoff, triplet, bonding_only)


[docs]
    def bond_angles(self, triplet=None):
        """Compute bond angle statistics.

        Parameters
        ----------
        triplet : str, optional
            Triplet to analyse, e.g. "O-Si-O". If None, all triplets computed.

        Returns
        -------
        dict
            Keyed by triplet string. Each value is a dict with
            "mean", "std", "min", "max", "count" (angles in degrees).
        """
        raw = self._compute_all_angles(triplet)
        return compute_bond_angle_stats(raw)


    # ── RDF and S(q) ────────────────────────────────────────────────────


[docs]
    def rdf(self, pair=None, rmax=None, nbins=200, sigma=0.0):
        """Compute the radial distribution function g(r).

        Parameters
        ----------
        pair : str, optional
            Pair to analyse, e.g. "Si-O". If None, total RDF.
        rmax : float, optional
            Maximum radius in A. Auto-detected from cell if None.
        nbins : int
            Number of histogram bins (default 200).
        sigma : float
            Gaussian smearing width in A (default 0.0 = no smearing).
            Use 0.02-0.05 for comparison with experiment.

        Returns
        -------
        dict
            {"r": list[float], "g_r": list[float]}
        """
        return compute_rdf(self.atoms_list, pair, rmax, nbins, sigma=sigma)



[docs]
    def structure_factor(self, pair=None, qmax=15.0, nq=300, rmax=None):
        """Compute the structure factor S(q) from g(r) via Fourier transform.

        Parameters
        ----------
        pair : str, optional
            Pair to analyse. If None, total S(q).
        qmax : float
            Maximum q in 1/A (default 15.0).
        nq : int
            Number of q points (default 300).
        rmax : float, optional
            Maximum radius for RDF used in the transform.

        Returns
        -------
        dict
            {"q": list[float], "s_q": list[float]}
        """
        return compute_structure_factor(self.atoms_list, pair, qmax, nq, rmax)



[docs]
    def averaged_rdf(self, pair=None, rmax=None, nbins=200):
        """Compute RDF per structure with mean and standard deviation.

        Parameters
        ----------
        pair : str, optional
            Pair to analyse, e.g. "Si-O". If None, total RDF.
        rmax : float, optional
            Maximum radius in A. Auto-detected from cell if None.
        nbins : int
            Number of histogram bins (default 200).

        Returns
        -------
        dict
            {"r": list, "g_r_mean": list, "g_r_std": list, "n_structures": int}
        """
        return compute_averaged_rdf(self.atoms_list, pair, rmax, nbins)


    # ── Advanced analysis ───────────────────────────────────────────────


[docs]
    def ring_statistics(self, bond_pair=None, cutoff=None, max_ring=12):
        """Compute ring size statistics for network-forming structures.

        Parameters
        ----------
        bond_pair : tuple of str, optional
            Bond pair to trace, e.g. ("Si", "O"). Auto-detected if None.
        cutoff : float or dict, optional
            Bond cutoff. Uses analyser cutoff if None.
        max_ring : int
            Maximum ring size to search (default 12).

        Returns
        -------
        dict
            {"ring_sizes": list, "counts": list, "fractions": list,
             "bond_pair": tuple, "total_rings": int}
        """
        return compute_ring_statistics(
            self.atoms_list, bond_pair, cutoff, max_ring, self._get_cutoff)



[docs]
    def voronoi(self, element=None):
        """Compute Voronoi tessellation indices (n3, n4, n5, n6).

        Parameters
        ----------
        element : str, optional
            Element to analyse. If None, all atoms included.

        Returns
        -------
        dict
            {"indices": list, "distribution": dict, "top_10": list,
             "mean_faces": float, "total_atoms": int}
        """
        return compute_voronoi(self.atoms_list, element)



[docs]
    def energy_ranking(self):
        """Rank structures by potential energy per atom.

        Reads energy from atoms.info or attached calculator.

        Returns
        -------
        dict
            {"energies_per_atom": dict, "ranking": list, "best": int,
             "worst": int, "best_energy": float, "worst_energy": float,
             "spread": float}. Returns "warning" key if no energy data.
        """
        return compute_energy_ranking(self.atoms_list)


    # ── Multi-structure averaging ───────────────────────────────────────


[docs]
    def averaged_cn(self, pair=None):
        """Compute per-structure mean CN with overall mean and std.

        Parameters
        ----------
        pair : str, optional
            Pair to analyse, e.g. "Si-O". If None, all pairs computed.

        Returns
        -------
        dict
            Keyed by pair string. Each value is a dict with
            "mean_per_structure", "overall_mean", "overall_std", "n_structures".
        """
        cn_per_structure = defaultdict(list)

        for atoms in self.atoms_list:
            nbr_dict, syms = self._build_neighbour_dict(atoms)
            unique = sorted(set(syms))
            n = len(atoms)

            for s1 in unique:
                for s2 in unique:
                    if pair is not None:
                        p = pair.split("-")
                        if not ((s1 == p[0] and s2 == p[1]) or
                                (s1 == p[1] and s2 == p[0])):
                            continue
                    key = f"{s1}-{s2}"
                    cns = []
                    for a in range(n):
                        if syms[a] != s1:
                            continue
                        cn = sum(1 for _, sj, _, _ in nbr_dict[a] if sj == s2)
                        cns.append(cn)
                    if cns:
                        cn_per_structure[key].append(np.mean(cns))

        result = {}
        for key, means in cn_per_structure.items():
            means = np.array(means)
            result[key] = {
                "mean_per_structure": means.tolist(),
                "overall_mean": float(np.mean(means)),
                "overall_std": float(np.std(means)),
                "n_structures": len(means),
            }
        return result


    # ── Summary and reporting ───────────────────────────────────────────


[docs]
    def summary(self, show_angles=True):
        lines = []
        bar = "=" * 65

        n_structs = len(self.atoms_list)
        formula = self.atoms_list[0].get_chemical_formula(mode="hill")
        n_atoms = len(self.atoms_list[0])

        lines.append(f"\n{bar}")
        lines.append(f"  Structural Analysis: {formula} ({n_atoms} atoms)")
        lines.append(f"  N structures: {n_structs}")
        lines.append(f"  Cutoff mode: {self._cutoff_mode}")
        if isinstance(self.cutoff, dict):
            for pair in sorted(self.cutoff):
                lines.append(f"    {pair}: {self.cutoff[pair]:.2f} A")
        else:
            lines.append(f"    All pairs: {self.cutoff} A")
        lines.append(bar)

        d = self.density()
        lines.append(f"\n  Density: {d['mean']:.2f} +/- {d['std']:.2f} g/cm3")

        bd = self.bond_distances()
        if bd:
            lines.append(f"\n  Bond distances:")
            lines.append(f"  {'Pair':<10} {'Mean (A)':>10} {'Std':>8} "
                         f"{'Min':>8} {'Max':>8} {'Count':>8}")
            lines.append(f"  {'-'*54}")
            for pair, data in bd.items():
                lines.append(
                    f"  {pair:<10} {data['mean']:>10.3f} {data['std']:>7.3f} "
                    f"{data['min']:>8.3f} {data['max']:>8.3f} "
                    f"{data['count']:>8}")

        cn = self.coordination()
        if cn:
            try:
                from ..pipeline.random_gen import _classify_bond
            except ImportError:
                from amorphgen.pipeline.random_gen import _classify_bond

            bonding_cn = {}
            nonbonded_cn = {}
            for pair, data in cn.items():
                s1, s2 = pair.split("-")
                bt = _classify_bond(s1, s2)
                if bt == "ionic" or (bt == "covalent" and s1 != s2):
                    bonding_cn[pair] = data
                else:
                    nonbonded_cn[pair] = data

            def _fmt(pair, data):
                l1 = (f"  {pair}: mean={data['mean']:.1f} +/- "
                      f"{data['std']:.1f} [{data['min']},{data['max']}]")
                parts = [f"CN={cn}: {pct:.1f}%"
                         for cn, pct in sorted(data["distribution"].items())
                         if pct >= 0.5]
                l2 = "    Distribution: " + ", ".join(parts)
                return [l1, l2]

            if bonding_cn:
                lines.append(f"\n  Bonding coordination numbers:")
                for pair, data in bonding_cn.items():
                    lines.extend(_fmt(pair, data))

            if nonbonded_cn:
                nb = {k: v for k, v in nonbonded_cn.items()
                      if v["mean"] > 0.05}
                if nb:
                    lines.append(f"\n  Non-bonded contacts:")
                    for pair, data in nb.items():
                        lines.extend(_fmt(pair, data))

        if show_angles:
            ba = self.bond_angles()
            if ba:
                lines.append(f"\n  Bond angles:")
                lines.append(f"  {'Triplet':<12} {'Mean (deg)':>10} "
                             f"{'Std':>8} {'Count':>8}")
                lines.append(f"  {'-'*42}")
                for triplet, data in ba.items():
                    lines.append(
                        f"  {triplet:<12} {data['mean']:>10.1f} "
                        f"{data['std']:>7.1f} {data['count']:>8}")

        lines.append(f"\n{bar}\n")

        text = "\n".join(lines)
        print(text)
        return text



[docs]
    def per_structure_summary(self) -> str:
        """
        Analyse each structure individually and produce a comparison table.

        Returns
        -------
        str — formatted table (also printed).
        """
        lines = []
        bar = "=" * 80
        formula = self.atoms_list[0].get_chemical_formula(mode="hill")
        n_atoms = len(self.atoms_list[0])

        lines.append(f"\n{bar}")
        lines.append(f"  Per-Structure Analysis: {formula} ({n_atoms} atoms)")
        lines.append(f"  N structures: {len(self.atoms_list)}")
        lines.append(bar)

        # Determine bonding pairs for CN
        try:
            from ..pipeline.random_gen import _classify_bond
        except ImportError:
            from amorphgen.pipeline.random_gen import _classify_bond

        unique = sorted(set(self.atoms_list[0].get_chemical_symbols()))
        bonding_pairs = []
        for s1 in unique:
            for s2 in unique:
                bt = _classify_bond(s1, s2)
                if bt == "ionic" or (bt == "covalent" and s1 != s2):
                    bonding_pairs.append(f"{s1}-{s2}")

        # If single element, use same-species
        if not bonding_pairs:
            bonding_pairs = [f"{unique[0]}-{unique[0]}"]

        # Header
        cn_headers = [f"CN({p})" for p in bonding_pairs[:3]]
        header = f"  {'#':<5} {'Density':>8} {'E/atom':>10}"
        for h in cn_headers:
            header += f" {h:>10}"
        lines.append(f"\n{header}")
        lines.append(f"  {'-'*(len(header)-2)}")

        # Per-structure data
        all_densities = []
        all_energies = []
        all_cns = {p: [] for p in bonding_pairs[:3]}

        for i, atoms in enumerate(self.atoms_list):
            sa_single = StructureAnalyser([atoms], cutoff=self.cutoff)

            # Density
            d = sa_single.density()
            dens = d["mean"]
            all_densities.append(dens)

            # Energy (per atom, using THIS structure's atom count)
            n_at = len(atoms)
            e_str = ""
            for key in ['energy', 'Energy', 'potential_energy']:
                if key in atoms.info:
                    e = atoms.info[key] / n_at
                    all_energies.append(e)
                    e_str = f"{e:.4f}"
                    break
            if not e_str:
                try:
                    e = atoms.get_potential_energy() / n_at
                    all_energies.append(e)
                    e_str = f"{e:.4f}"
                except Exception:
                    e_str = "N/A"

            # CN for bonding pairs
            cn = sa_single.coordination()
            row = f"  {i:<5} {dens:>8.2f} {e_str:>10}"
            for p in bonding_pairs[:3]:
                cn_val = cn.get(p, {}).get("mean", 0)
                all_cns[p].append(cn_val)
                row += f" {cn_val:>10.1f}"

            lines.append(row)

        # Summary statistics
        lines.append(f"  {'-'*(len(header)-2)}")
        mean_row = f"  {'Mean':<5} {np.mean(all_densities):>8.2f}"
        std_row = f"  {'Std':<5} {np.std(all_densities):>8.2f}"
        if all_energies:
            mean_row += f" {np.mean(all_energies):>10.4f}"
            std_row += f" {np.std(all_energies):>10.4f}"
        else:
            mean_row += f" {'N/A':>10}"
            std_row += f" {'N/A':>10}"
        for p in bonding_pairs[:3]:
            mean_row += f" {np.mean(all_cns[p]):>10.1f}"
            std_row += f" {np.std(all_cns[p]):>10.1f}"
        lines.append(mean_row)
        lines.append(std_row)
        lines.append(f"\n{bar}\n")

        text = "\n".join(lines)
        print(text)
        return text



[docs]
    def save_report(self, filepath, text=None, show_angles=True):
        """Save the summary report to a text file.

        Parameters
        ----------
        filepath : str
            Output file path.
        text : str, optional
            Pre-computed report text. If None, calls summary().
        show_angles : bool
            Include bond angles in the report (default True).
        """
        if text is None:
            text = self.summary(show_angles=show_angles)
        with open(filepath, "w") as f:
            f.write(text)
        print(f"  Report saved: {filepath}")



[docs]
    def plot(self, **kwargs):
        """Generate and save analysis plots (RDF, CN distribution, bond angles).

        Parameters
        ----------
        output_dir : str
            Directory for output files (default ".").
        prefix : str
            Filename prefix (default "analysis").
        rdf_pairs : list of str, optional
            Pairs to plot, e.g. ["Si-O", "O-O"]. If None, all pairs.
        rmax : float, optional
            Maximum radius for RDF. Auto-detected if None.
        save_csv : bool
            Also save raw data as CSV files (default True).
        """
        return plot_analysis(self, **kwargs)