includes/archive.php

<?php
declare(strict_types=1);

require_once __DIR__ . '/core.php';

/*
|--------------------------------------------------------------------------
| Archive library + endpoint
|--------------------------------------------------------------------------
|
| Included from the dashboard:
|   Provides archive readers.
|
| Called directly:
|   Creates, verifies and purges yearly archives.
|
| Archives are stored per Brivacia site, for example:
|   /archives/2025-breat.json
|   /archives/2025-code.json
|
| This prevents a replace/import for one site from overwriting archived data
| belonging to another site.
|
*/

// Only act as an endpoint when this file is the script that was requested
// directly. When it is merely included, nothing runs here and the functions
// declared below are simply made available to the including script.
if (brivaciaArchiveIsEndpoint()) {
    // Loaded before the endpoint runs; the endpoint builds its responses
    // with t(), which presumably relies on these translations.
    loadTranslations();
    // Always terminates the request with exit.
    runYearArchiveEndpoint();
}


/*
|--------------------------------------------------------------------------
| Archive readers
|--------------------------------------------------------------------------
*/

/**
 * Tell whether this file is the script being executed, as opposed to
 * having been included by another script.
 *
 * @return bool True when the resolved SCRIPT_FILENAME equals the resolved
 *              path of this file.
 */
function brivaciaArchiveIsEndpoint(): bool {
    $entryScript = realpath($_SERVER['SCRIPT_FILENAME'] ?? '');
    $thisFile = realpath(__FILE__);

    return $entryScript === $thisFile;
}

/**
 * Turn a site name into an identifier that is safe to use in a file name.
 *
 * The name is trimmed and lower-cased, every run of characters other than
 * a-z, 0-9, "_" and "-" becomes a single "-", and leading/trailing "-" and
 * "_" are stripped.
 *
 * @param string $site Raw site name.
 * @return string Sanitised identifier, or "site" when nothing usable remains.
 */
function archiveSafeSiteId(string $site): string {
    $lowered = strtolower(trim($site));
    $replaced = preg_replace('/[^a-z0-9_-]+/', '-', $lowered);
    $identifier = trim($replaced ?? '', '-_');

    if ($identifier === '') {
        return 'site';
    }

    return $identifier;
}

/**
 * Build the archive file name for one year and one site.
 *
 * @param int    $year Archive year, zero-padded to four digits.
 * @param string $site Raw site name; sanitised with archiveSafeSiteId().
 * @return string File name of the form "YYYY-site.json" (no directory).
 */
function archiveFileName(int $year, string $site): string {
    $siteId = archiveSafeSiteId($site);

    return sprintf('%04d-%s.json', $year, $siteId);
}

/**
 * Extract the year from an archive file name.
 *
 * Both per-site names ("2025-site.json") and legacy names ("2025.json")
 * are recognised; only the base name of the path is inspected.
 *
 * @param string $file Archive path or file name.
 * @return int The four-digit year as an integer, or 0 when the name does
 *             not look like an archive.
 */
function archiveFileYear(string $file): int {
    $name = basename($file);

    if (preg_match('/^(\d{4})(?:-[a-z0-9_-]+)?\.json$/i', $name, $parts) !== 1) {
        return 0;
    }

    return (int)$parts[1];
}

/**
 * Extract the site identifier from a per-site archive file name.
 *
 * @param string $file Archive path or file name.
 * @return string The part between "YYYY-" and ".json", or an empty string
 *                for legacy ("2025.json") or unrelated names.
 */
function archiveFileSite(string $file): string {
    $found = preg_match('/^\d{4}-([a-z0-9_-]+)\.json$/i', basename($file), $parts);

    return $found === 1 ? (string)$parts[1] : '';
}

/**
 * List every yearly archive file in the archive directory.
 *
 * @return array Paths of the matching *.json files, naturally sorted
 *               without regard to case.
 */
function brivaciaArchiveFiles(): array {
    $candidates = glob(archiveDir() . '/*.json') ?: [];
    $archives = [];

    foreach ($candidates as $path) {
        // Keep per-site archives (2025-site.json) and legacy archives
        // (2025.json); summary.json and temporary files do not match.
        if (preg_match('/^\d{4}(?:-[a-z0-9_-]+)?\.json$/i', basename($path)) === 1) {
            $archives[] = $path;
        }
    }

    sort($archives, SORT_NATURAL | SORT_FLAG_CASE);

    return $archives;
}

/**
 * Path of the cached all-time summary inside the archive directory.
 *
 * @return string The archive directory followed by "/summary.json".
 */
function brivaciaArchiveSummaryFile(): string {
    $directory = archiveDir();

    return $directory . '/summary.json';
}

/**
 * Read all archives of one year and merge them into a single structure.
 *
 * Rows of every archive file whose name carries the requested year are
 * appended section by section. "generated_at" ends up holding the last
 * non-empty value encountered. Results are memoised per year for the
 * lifetime of the request.
 *
 * @param int $year Year to load.
 * @return array Keys: year, generated_at, days, pages, countries, referrers.
 */
function readYearArchive(int $year): array {
    static $cache = [];

    if (!isset($cache[$year])) {
        $sections = ['days', 'pages', 'countries', 'referrers'];
        $result = ['year' => $year, 'generated_at' => ''] + array_fill_keys($sections, []);

        foreach (brivaciaArchiveFiles() as $path) {
            if (archiveFileYear($path) === $year) {
                $data = readArchiveFile($path);

                foreach ($sections as $name) {
                    foreach (($data[$name] ?? []) as $entry) {
                        $result[$name][] = $entry;
                    }
                }

                $stamp = $data['generated_at'] ?? '';

                if ($stamp !== '') {
                    $result['generated_at'] = (string)$stamp;
                }
            }
        }

        $cache[$year] = $result;
    }

    return $cache[$year];
}

/**
 * Decode one archive file, memoising the result per path.
 *
 * @param string $file Path of the JSON file.
 * @return array The decoded JSON, or an empty array when the content does
 *               not decode to an array. The first result for a path is
 *               reused for the rest of the request.
 */
function readArchiveFile(string $file): array {
    static $cache = [];

    if (!isset($cache[$file])) {
        $decoded = json_decode((string)file_get_contents($file), true);
        $cache[$file] = is_array($decoded) ? $decoded : [];
    }

    return $cache[$file];
}

/**
 * Tell whether a calendar year intersects a date range.
 *
 * Dates are compared as strings, so both bounds are expected in the same
 * "YYYY-MM-DD" form used for the year's first and last day.
 *
 * @param int         $year  Year to test.
 * @param string|null $start First day of the range, or null.
 * @param string|null $end   Last day of the range, or null.
 * @return bool True when either bound is null, or when the year's
 *              01-01..12-31 span intersects [$start, $end].
 */
function archiveYearOverlaps(int $year, ?string $start, ?string $end): bool {
    if (!isset($start, $end)) {
        return true;
    }

    $firstDay = $year . '-01-01';
    $lastDay = $year . '-12-31';

    return $lastDay >= $start && $firstDay <= $end;
}

/**
 * Collect the rows of one section from every archive, optionally limited
 * to a date range.
 *
 * Each bound is applied on its own, so a half-open range works: with only
 * $start, rows before it are dropped; with only $end, rows after it are
 * dropped. Previously a missing $end compared every day against null
 * (discarding all rows) and a missing $start disabled filtering entirely.
 *
 * @param string      $section Archive section: "days", "pages", "countries"
 *                             or "referrers".
 * @param string|null $start   Inclusive first day ("YYYY-MM-DD"), or null
 *                             for no lower bound.
 * @param string|null $end     Inclusive last day ("YYYY-MM-DD"), or null
 *                             for no upper bound.
 * @return array Matching rows in archive-file order.
 */
function archiveRows(string $section, ?string $start = null, ?string $end = null): array {
    $rows = [];

    foreach (brivaciaArchiveFiles() as $file) {
        $year = archiveFileYear($file);

        // Skip files whose year cannot intersect the range before decoding them.
        if ($year <= 0 || !archiveYearOverlaps($year, $start, $end)) {
            continue;
        }

        $archive = readArchiveFile($file);

        foreach (($archive[$section] ?? []) as $row) {
            // Rows without a day compare as '' and are therefore excluded
            // whenever a lower bound is given.
            $day = (string)($row['day'] ?? '');

            $beforeStart = $start !== null && $day < $start;
            $afterEnd = $end !== null && $day > $end;

            if ($beforeStart || $afterEnd) {
                continue;
            }

            $rows[] = $row;
        }
    }

    return $rows;
}

/**
 * Sum the archived hit counters, for all time or for a date range.
 *
 * Without any bound the totals come from the cached summary; otherwise the
 * archived "days" rows inside the range are added up.
 *
 * @param string|null $start First day of the range, or null.
 * @param string|null $end   Last day of the range, or null.
 * @return array Keys: unique_visitors, visits, pageviews, bots.
 */
function archiveHitsTotals(?string $start = null, ?string $end = null): array {
    $counters = array_fill_keys(['unique_visitors', 'visits', 'pageviews', 'bots'], 0);

    if ($start === null && $end === null) {
        return array_merge($counters, archiveSummary()['hits'] ?? []);
    }

    $names = array_keys($counters);

    foreach (archiveRows('days', $start, $end) as $dayRow) {
        foreach ($names as $name) {
            $counters[$name] += (int)($dayRow[$name] ?? 0);
        }
    }

    return $counters;
}

/**
 * Total the archived views of a section, grouped by one of its columns.
 *
 * For "countries" and "referrers" without any date bound, the pre-computed
 * list from the cached summary is returned as is.
 *
 * @param string      $section Archive section to read.
 * @param string      $keyName Column to group by; rows with an empty value
 *                             are ignored.
 * @param string|null $start   First day of the range, or null.
 * @param string|null $end     Last day of the range, or null.
 * @return array List of [$keyName => value, 'views' => total], highest
 *               view count first.
 */
function archiveGroupedViews(string $section, string $keyName, ?string $start = null, ?string $end = null): array {
    $allTime = $start === null && $end === null;

    if ($allTime && in_array($section, ['countries', 'referrers'], true)) {
        return archiveSummary()[$section] ?? [];
    }

    $totals = [];

    foreach (archiveRows($section, $start, $end) as $entry) {
        $label = (string)($entry[$keyName] ?? '');

        if ($label !== '') {
            if (!isset($totals[$label])) {
                $totals[$label] = [
                    $keyName => $label,
                    'views' => 0,
                ];
            }

            $totals[$label]['views'] += (int)($entry['views'] ?? 0);
        }
    }

    $list = array_values($totals);

    usort($list, static function (array $left, array $right): int {
        return $right['views'] <=> $left['views'];
    });

    return $list;
}

/**
 * Aggregate archived page rows into one entry per (site, page_key).
 *
 * Views are summed. Title and URL are taken from a row when that row is
 * marked resolved (page_resolved === 1) or while the entry has no title
 * yet; empty titles/URLs never overwrite existing values.
 *
 * @param string|null $start First day of the range, or null.
 * @param string|null $end   Last day of the range, or null.
 * @return array List of entries (site, page_key, title, url, views,
 *               page_resolved) ordered by views descending, then by title.
 */
function archivePages(?string $start = null, ?string $end = null): array {
    $merged = [];

    foreach (archiveRows('pages', $start, $end) as $entry) {
        $site = (string)($entry['site'] ?? '');
        $pageKey = (string)($entry['page_key'] ?? '');

        if ($site === '' || $pageKey === '') {
            continue;
        }

        // NUL separates the two parts of the composite key.
        $id = implode("\0", [$site, $pageKey]);

        if (!isset($merged[$id])) {
            $merged[$id] = [
                'site' => $site,
                'page_key' => $pageKey,
                'title' => '',
                'url' => '',
                'views' => 0,
                'page_resolved' => 0,
            ];
        }

        $current = $merged[$id];
        $current['views'] += (int)($entry['views'] ?? 0);

        $resolved = (int)($entry['page_resolved'] ?? 0);
        $title = (string)($entry['title'] ?? '');
        $url = (string)($entry['url'] ?? '');

        // Decided once, before any label of this entry is touched.
        $mayRelabel = $resolved === 1 || $current['title'] === '';

        if ($mayRelabel && $title !== '') {
            $current['title'] = $title;
        }

        if ($mayRelabel && $url !== '') {
            $current['url'] = $url;
        }

        if ($mayRelabel) {
            $current['page_resolved'] = max($current['page_resolved'], $resolved);
        }

        $merged[$id] = $current;
    }

    $list = array_values($merged);

    usort($list, static function (array $left, array $right): int {
        $byViews = $right['views'] <=> $left['views'];

        if ($byViews !== 0) {
            return $byViews;
        }

        return strcmp((string)$left['title'], (string)$right['title']);
    });

    return $list;
}


/*
|--------------------------------------------------------------------------
| Archive summary cache
|--------------------------------------------------------------------------
|
| The dashboard needs all-time counters on every page. Reading every yearly
| archive for each card is wasteful, so closed-year totals are cached here.
|
| The summary intentionally aggregates every per-site archive, because the
| current dashboard is global. A future site filter can add a site-aware
| summary without changing the archive file format.
|
*/

/**
 * Recompute the all-time summary from every archive file and store it in
 * summary.json.
 *
 * The summary lists the archived years and sites, the summed hit counters
 * of all "days" rows, and per-country / per-referrer view totals sorted by
 * views descending.
 *
 * The file is written to a temporary path and renamed into place. When
 * encoding or writing fails, the existing summary.json is left untouched
 * (instead of being replaced by an empty file), the temporary file is
 * removed and the failure is logged; the freshly computed summary is still
 * returned.
 *
 * NOTE(review): archives are read through readArchiveFile(), which memoises
 * per request. If an archive was read earlier in the same request and then
 * rewritten, this rebuild would presumably see the older content — confirm
 * against callers.
 *
 * @return array The computed summary (generated_at, years, sites, hits,
 *               countries, referrers).
 */
function rebuildArchiveSummary(): array {
    $summary = [
        'generated_at' => date('c'),
        'years' => [],
        'sites' => [],
        'hits' => [
            'unique_visitors' => 0,
            'visits' => 0,
            'pageviews' => 0,
            'bots' => 0,
        ],
        'countries' => [],
        'referrers' => [],
    ];

    $countries = [];
    $referrers = [];

    foreach (brivaciaArchiveFiles() as $file) {
        $archive = readArchiveFile($file);

        // Prefer the values stored in the archive; fall back to the file name.
        $year = (int)($archive['year'] ?? archiveFileYear($file));
        $site = (string)($archive['site'] ?? archiveFileSite($file));

        if ($year > 0) {
            $summary['years'][] = $year;
        }

        if ($site !== '') {
            $summary['sites'][] = $site;
        }

        foreach (($archive['days'] ?? []) as $row) {
            foreach ($summary['hits'] as $key => $_) {
                $summary['hits'][$key] += (int)($row[$key] ?? 0);
            }
        }

        foreach (($archive['countries'] ?? []) as $row) {
            $country = (string)($row['country'] ?? '');

            if ($country === '') {
                continue;
            }

            $countries[$country] = ($countries[$country] ?? 0) + (int)($row['views'] ?? 0);
        }

        foreach (($archive['referrers'] ?? []) as $row) {
            $referrer = (string)($row['referrer'] ?? '');

            if ($referrer === '') {
                continue;
            }

            $referrers[$referrer] = ($referrers[$referrer] ?? 0) + (int)($row['views'] ?? 0);
        }
    }

    $summary['years'] = array_values(array_unique($summary['years']));
    sort($summary['years'], SORT_NUMERIC);

    $summary['sites'] = array_values(array_unique($summary['sites']));
    sort($summary['sites'], SORT_NATURAL | SORT_FLAG_CASE);

    foreach ($countries as $country => $views) {
        $summary['countries'][] = [
            'country' => $country,
            'views' => $views,
        ];
    }

    foreach ($referrers as $referrer => $views) {
        $summary['referrers'][] = [
            'referrer' => $referrer,
            'views' => $views,
        ];
    }

    usort($summary['countries'], fn($a, $b) => $b['views'] <=> $a['views']);
    usort($summary['referrers'], fn($a, $b) => $b['views'] <=> $a['views']);

    $file = brivaciaArchiveSummaryFile();
    $tmp = $file . '.tmp';

    // json_encode() returns false on failure (e.g. malformed UTF-8 in a row).
    $json = json_encode(
        $summary,
        JSON_PRETTY_PRINT |
        JSON_UNESCAPED_UNICODE |
        JSON_UNESCAPED_SLASHES
    );

    // Only replace the existing summary once a complete file was written.
    $persisted = $json !== false
        && file_put_contents($tmp, $json, LOCK_EX) === strlen($json)
        && rename($tmp, $file);

    if (!$persisted) {
        if (is_file($tmp)) {
            unlink($tmp);
        }

        brivaciaLog('archive/failed.log', 'summary write failed file=' . basename($file));
    }

    return $summary;
}

/**
 * Return the all-time archive summary, loading it at most once per request.
 *
 * The stored summary.json is used when it exists and decodes to an array;
 * otherwise the summary is rebuilt from the archives.
 *
 * @return array The summary structure.
 */
function archiveSummary(): array {
    static $summary = null;

    if ($summary !== null) {
        return $summary;
    }

    $path = brivaciaArchiveSummaryFile();
    $stored = is_file($path)
        ? json_decode((string)file_get_contents($path), true)
        : null;

    $summary = is_array($stored) ? $stored : rebuildArchiveSummary();

    return $summary;
}


/*
|--------------------------------------------------------------------------
| Archive endpoint
|--------------------------------------------------------------------------
*/

/**
 * HTTP entry point: archive one closed year, then purge it from the database.
 *
 * Query parameters:
 *   year  Year to archive; defaults to last year. Must be 2020 or later and
 *         strictly before the current year, otherwise the request ends
 *         with status 400.
 *   site  Optional single site; when absent every site returned by
 *         archiveSitesForYear() is processed.
 *
 * For each site the archive is created, verified and only then purged.
 * Any failure ends the request with status 500. Sites processed before the
 * failure have already been purged, so the summary is rebuilt in that case
 * too; otherwise their rows would be missing from the all-time totals until
 * the next successful run.
 *
 * This function always terminates the script with exit.
 */
function runYearArchiveEndpoint(): void {
    $db = brivaciaDb();
    $year = (int)($_GET['year'] ?? ((int)date('Y') - 1));
    $requestedSite = trim((string)($_GET['site'] ?? ''));
    $siteLabel = $requestedSite !== '' ? $requestedSite : 'all';

    if ($year < 2020 || $year >= (int)date('Y')) {
        brivaciaLog('archive/invalid.log', 'invalid year=' . $year);

        http_response_code(400);
        exit(t('archive.invalid.year'));
    }

    // Names of the archives that were written AND whose rows were purged.
    $files = [];

    try {
        brivaciaLog(
            'archive/year.log',
            'start year=' . $year . ' site=' . $siteLabel
        );

        $sites = $requestedSite !== ''
            ? [$requestedSite]
            : archiveSitesForYear($db, $year);

        foreach ($sites as $site) {
            // Order matters: never purge before the archive is verified.
            $file = createYearArchive($db, $year, $site);
            verifyYearArchive($file, $year, $site);
            purgeArchivedYear($db, $year, $site);

            $files[] = basename($file);

            brivaciaLog(
                'archive/year.log',
                'success year=' . $year . ' site=' . $site . ' file=' . basename($file)
            );
        }

        rebuildArchiveSummary();

    } catch (Throwable $e) {
        brivaciaLog(
            'archive/failed.log',
            'failed year=' . $year . ' site=' . $siteLabel . ' error=' . $e->getMessage()
        );

        // Partial success: some sites are already purged, so refresh the
        // summary. Best effort only; the original failure is what gets reported.
        if ($files !== []) {
            try {
                rebuildArchiveSummary();
            } catch (Throwable $summaryError) {
                brivaciaLog(
                    'archive/failed.log',
                    'summary rebuild failed year=' . $year . ' error=' . $summaryError->getMessage()
                );
            }
        }

        http_response_code(500);
        exit(t('archive.failed'));
    }

    header('Content-Type: text/plain; charset=utf-8');

    echo t('archive.created') . ' : ' . implode(', ', $files) . PHP_EOL;
    echo t('archive.sqlite.purged') . " $year";

    exit;
}

/**
 * List the sites that have rows in hits_daily for the given year.
 *
 * @param PDO $db   Database connection.
 * @param int $year Year to inspect (01-01 through 12-31).
 * @return array Site names as strings, in the order returned by the query.
 */
function archiveSitesForYear(PDO $db, int $year): array {
    $firstDay = sprintf('%04d-01-01', $year);
    $lastDay = sprintf('%04d-12-31', $year);

    $rows = fetchAll($db, '
        SELECT DISTINCT site
        FROM hits_daily
        WHERE day BETWEEN ? AND ?
        AND site != ""
        ORDER BY site
    ', [$firstDay, $lastDay]);

    $sites = [];

    foreach ($rows as $row) {
        $name = (string)($row['site'] ?? '');

        // Falsy names are skipped, which covers '' as well as '0'.
        if ($name !== '' && $name !== '0') {
            $sites[] = $name;
        }
    }

    return $sites;
}

/**
 * Write the JSON archive of one site for one year and return its path.
 *
 * The archive contains the site's rows from hits_daily, pages_daily,
 * countries_daily and referrers_daily for the whole year. It is written to
 * a temporary file and renamed into place.
 *
 * Callers purge the database after this function (and verification)
 * succeed, so every step that could leave a missing, truncated or stale
 * file behind raises an exception instead of returning:
 *   - an empty snapshot is refused, so a repeated run after a purge cannot
 *     replace a populated archive with an empty one;
 *   - encoding and write results are checked;
 *   - a failed rename is reported, so an older archive at the same path
 *     cannot be mistaken for the new one.
 *
 * NOTE(review): a non-empty snapshot still replaces an existing archive of
 * the same year and site — confirm that this is the intended behaviour for
 * re-imports.
 *
 * @param PDO    $db   Database connection.
 * @param int    $year Year to archive.
 * @param string $site Site name; surrounding whitespace is ignored.
 * @return string Path of the archive file.
 * @throws RuntimeException When the site is missing, there is nothing to
 *                          archive, or the file could not be written.
 */
function createYearArchive(PDO $db, int $year, string $site): string {
    $site = trim($site);
    $start = sprintf('%04d-01-01', $year);
    $end = sprintf('%04d-12-31', $year);

    if ($site === '') {
        throw new RuntimeException('Archive site is missing');
    }

    /*
    |--------------------------------------------------------------------------
    | Resolve important page labels before freezing a closed year
    |--------------------------------------------------------------------------
    |
    | Archive files are JSON snapshots. Once a year is archived, its SQLite rows
    | are purged, so raw imported page labels would otherwise stay raw forever in
    | the "all time" dashboard view.
    |
    */

    refreshPageLabelsForRange(
        $db,
        'site = ? AND day BETWEEN ? AND ?',
        [$site, $start, $end],
        100
    );

    $pages = fetchAll(
        $db,
        'SELECT * FROM pages_daily WHERE site = ? AND day BETWEEN ? AND ? ORDER BY day, views DESC',
        [$site, $start, $end]
    );

    // Store display-ready URL and title with every archived page row.
    foreach ($pages as &$page) {
        $pageSite = (string)($page['site'] ?? $site);
        $pageKey = (string)($page['page_key'] ?? '');
        $rawTitle = (string)($page['title'] ?? '');
        $rawUrl = (string)($page['url'] ?? '');
        $displayUrl = pageUrl($pageSite, $pageKey, $rawUrl);

        $page['url'] = $displayUrl;
        $page['title'] = cleanPageTitle(
            $rawTitle !== '' ? $rawTitle : $pageKey,
            $pageSite,
            $pageKey,
            $displayUrl
        );
    }
    unset($page);

    $archive = [
        'year' => $year,
        'site' => $site,
        'generated_at' => date('c'),
        'days' => fetchAll(
            $db,
            'SELECT * FROM hits_daily WHERE site = ? AND day BETWEEN ? AND ? ORDER BY day',
            [$site, $start, $end]
        ),
        'pages' => $pages,
        'countries' => fetchAll(
            $db,
            'SELECT * FROM countries_daily WHERE site = ? AND day BETWEEN ? AND ? ORDER BY day, views DESC',
            [$site, $start, $end]
        ),
        'referrers' => fetchAll(
            $db,
            'SELECT * FROM referrers_daily WHERE site = ? AND day BETWEEN ? AND ? ORDER BY day, views DESC',
            [$site, $start, $end]
        ),
    ];

    // Refuse to write a snapshot without a single row.
    $hasRows = false;

    foreach (['days', 'pages', 'countries', 'referrers'] as $section) {
        if (!empty($archive[$section])) {
            $hasRows = true;
            break;
        }
    }

    if (!$hasRows) {
        throw new RuntimeException('Nothing to archive for year ' . $year . ' site ' . $site);
    }

    $file = archiveDir() . '/' . archiveFileName($year, $site);
    $tmp = $file . '.tmp';

    // json_encode() returns false on failure (e.g. malformed UTF-8 in a row).
    $json = json_encode(
        $archive,
        JSON_PRETTY_PRINT |
        JSON_UNESCAPED_UNICODE |
        JSON_UNESCAPED_SLASHES
    );

    if ($json === false) {
        throw new RuntimeException('Archive encoding failed: ' . json_last_error_msg());
    }

    $written = file_put_contents($tmp, $json, LOCK_EX);

    // A short or failed write must never be renamed into place.
    if ($written === false || $written !== strlen($json) || $written < 50) {
        if (is_file($tmp)) {
            unlink($tmp);
        }

        throw new RuntimeException('Archive write failed');
    }

    if (!rename($tmp, $file)) {
        if (is_file($tmp)) {
            unlink($tmp);
        }

        throw new RuntimeException('Archive rename failed');
    }

    return $file;
}

/**
 * Check that an archive file exists and has the expected structure.
 *
 * The file must be at least 50 bytes, decode to an array, carry the given
 * year (as an integer), the given site when one is passed, a generated_at
 * value, and array-valued days, pages, countries and referrers sections.
 *
 * @param string $file Path of the archive file.
 * @param int    $year Expected year.
 * @param string $site Expected site; an empty string skips the site check.
 * @throws RuntimeException When the file is missing/too small or when any
 *                          structural check fails.
 */
function verifyYearArchive(string $file, int $year, string $site = ''): void {
    if (!is_file($file) || filesize($file) < 50) {
        throw new RuntimeException('Archive file missing or empty');
    }

    $data = json_decode((string)file_get_contents($file), true);

    $valid = is_array($data)
        && ($data['year'] ?? null) === $year
        && ($site === '' || (string)($data['site'] ?? '') === $site)
        && isset($data['generated_at']);

    if ($valid) {
        foreach (['days', 'pages', 'countries', 'referrers'] as $section) {
            if (!isset($data[$section]) || !is_array($data[$section])) {
                $valid = false;
                break;
            }
        }
    }

    if (!$valid) {
        throw new RuntimeException('Archive verification failed');
    }
}

/**
 * Delete one site's rows of one year from every daily table, then VACUUM.
 *
 * All deletes run in a single transaction. VACUUM runs only after the
 * commit and outside the transactional try block: if it fails, the purge
 * itself is already committed and the VACUUM error is propagated as is.
 * Rollback is attempted only while a transaction is still open, so the
 * original error is never replaced by a "no active transaction" error.
 *
 * @param PDO    $db   Database connection.
 * @param int    $year Year to purge (01-01 through 12-31).
 * @param string $site Site name; surrounding whitespace is ignored.
 * @throws RuntimeException When the site is empty.
 * @throws Throwable        Any error raised by the deletes, the commit or
 *                          VACUUM.
 */
function purgeArchivedYear(PDO $db, int $year, string $site): void {
    $site = trim($site);
    $start = sprintf('%04d-01-01', $year);
    $end = sprintf('%04d-12-31', $year);

    if ($site === '') {
        throw new RuntimeException('Archive purge site is missing');
    }

    // Fixed list: table names are interpolated into the SQL below and must
    // never come from user input.
    $tables = [
        'hits_daily',
        'pages_daily',
        'countries_daily',
        'referrers_daily',
        'seen_daily',
        'visitor_sessions',
    ];

    $db->beginTransaction();

    try {
        foreach ($tables as $table) {
            $db->prepare("DELETE FROM $table WHERE site = ? AND day BETWEEN ? AND ?")
                ->execute([$site, $start, $end]);
        }

        $db->commit();

    } catch (Throwable $e) {
        if ($db->inTransaction()) {
            $db->rollBack();
        }

        throw $e;
    }

    $db->exec('VACUUM');
}

/**
 * Archive and purge every closed (site, year) pair still present in
 * hits_daily.
 *
 * Only years from 2020 up to, but excluding, the current year are handled.
 * Each pair is archived, verified and only then purged. The summary is
 * rebuilt whenever at least one pair was archived — including when a later
 * pair throws, because the earlier pairs are already purged by then and
 * would otherwise be missing from the all-time totals.
 *
 * @param PDO $db Database connection.
 * @return array Identifiers of the archived pairs, formatted "year-site".
 * @throws Throwable Any error raised while archiving, verifying or purging
 *                   a pair; processing stops at the first failure.
 */
function archiveClosedYears(PDO $db): array {
    $currentYear = (int)date('Y');
    $rows = fetchAll($db, '
        SELECT
            site,
            CAST(substr(day, 1, 4) AS INTEGER) AS year
        FROM hits_daily
        WHERE CAST(substr(day, 1, 4) AS INTEGER) < ?
        AND site != ""
        GROUP BY site, CAST(substr(day, 1, 4) AS INTEGER)
        ORDER BY year, site
    ', [$currentYear]);

    $archived = [];

    try {
        foreach ($rows as $row) {
            $site = (string)($row['site'] ?? '');
            $year = (int)($row['year'] ?? 0);

            if ($site === '' || $year < 2020 || $year >= $currentYear) {
                continue;
            }

            brivaciaLog('archive/year.log', 'auto start year=' . $year . ' site=' . $site);

            // Order matters: never purge before the archive is verified.
            $file = createYearArchive($db, $year, $site);
            verifyYearArchive($file, $year, $site);
            purgeArchivedYear($db, $year, $site);

            brivaciaLog('archive/year.log', 'auto success year=' . $year . ' site=' . $site);

            $archived[] = $year . '-' . $site;
        }
    } finally {
        // Runs on success and on failure alike, so purged pairs always
        // reach the summary.
        if ($archived !== []) {
            rebuildArchiveSummary();
        }
    }

    return $archived;
}