includes/rules.php

<?php
declare(strict_types=1);

/*
|--------------------------------------------------------------------------
| Page identifiers
|--------------------------------------------------------------------------
*/

/**
 * Builds the identifier used for a tracked page.
 *
 * With a tracked-website language the key has the form "<lang>:<page>";
 * without one the page value is returned unchanged.
 *
 * @param string $site               Site identifier; not used when building the key.
 * @param string $page               Page value to turn into a key.
 * @param string $trackedWebsiteLang Optional language prefix; an empty string means "no prefix".
 *
 * @return string The page key, with or without a language prefix.
 */
function brivaciaPageKey(string $site, string $page, string $trackedWebsiteLang = ''): string {
    if ($trackedWebsiteLang === '') {
        return $page;
    }

    return "{$trackedWebsiteLang}:{$page}";
}

/**
 * Removes everything up to and including the first ':' from a page key.
 *
 * Keys without a ':' are returned unchanged. Only the first colon is
 * treated as a separator, so any later colons stay in the result.
 *
 * @param string $key Page key, optionally in "<prefix>:<page>" form.
 *
 * @return string The part of the key after the first ':', or the key itself.
 */
function brivaciaRawPageKey(string $key): string {
    $separatorAt = strpos($key, ':');

    if ($separatorAt === false) {
        return $key;
    }

    return substr($key, $separatorAt + 1);
}


/*
|--------------------------------------------------------------------------
| Tracked website language
|--------------------------------------------------------------------------
|
| Tries to infer the tracked website language from a page URL or referrer.
| This is only about the tracked website, not the Brivacia dashboard language.
|
*/

/**
 * Determines the language of the tracked website.
 *
 * A non-empty fallback always wins: it is trimmed, cut to 10 bytes and
 * lower-cased. Otherwise the first path segment of the page URL, then of
 * the referrer, is accepted when it looks like "xx" or "xx-yy" (letters
 * only, case-insensitive) and is returned lower-cased.
 *
 * @param string $url      Page URL to inspect first.
 * @param string $ref      Referrer URL to inspect second.
 * @param string $fallback Explicit language value that overrides URL detection.
 *
 * @return string Lower-cased language code, or '' when nothing matched.
 */
function brivaciaInferTrackedWebsiteLang(string $url, string $ref = '', string $fallback = ''): string {
    $explicit = strtolower(substr(trim($fallback), 0, 10));

    if ($explicit !== '') {
        return $explicit;
    }

    foreach ([$url, $ref] as $candidate) {
        $path = parse_url($candidate, PHP_URL_PATH);

        // parse_url() yields null/false when there is no usable path.
        if (is_string($path) && $path !== '') {
            $segments = explode('/', trim($path, '/'));
            $head = trim($segments[0]);

            if (preg_match('~^[a-z]{2}(?:-[a-z]{2})?$~i', $head) === 1) {
                return strtolower($head);
            }
        }
    }

    return '';
}


/*
|--------------------------------------------------------------------------
| Page URLs
|--------------------------------------------------------------------------
*/

/**
 * Builds an absolute URL for a tracked page.
 *
 * Resolution order:
 *  1. a non-empty $url that already starts with http:// or https:// is returned as-is;
 *  2. any other non-empty $url is appended to "https://<host>/" after its leading slashes are removed;
 *  3. otherwise the page key is appended to "https://<host>", adding a "/" when the key lacks one.
 *
 * The host comes from siteLabel(), which is defined outside this file
 * (presumably it returns the site's host name — confirm against its definition).
 *
 * @param string $site    Site identifier passed to siteLabel().
 * @param string $pageKey Page key used when no URL is supplied.
 * @param string $url     Optional absolute or site-relative URL.
 *
 * @return string Absolute URL for the page.
 */
function brivaciaPageUrl(string $site, string $pageKey, string $url = ''): string {
    $origin = 'https://' . siteLabel($site);

    if ($url !== '') {
        return preg_match('~^https?://~i', $url)
            ? $url
            : $origin . '/' . ltrim($url, '/');
    }

    // A key with leading slash(es) is appended verbatim; otherwise one is inserted.
    return str_starts_with($pageKey, '/')
        ? $origin . $pageKey
        : $origin . '/' . $pageKey;
}


/*
|--------------------------------------------------------------------------
| Page titles
|--------------------------------------------------------------------------
*/

/**
 * Normalises a page title by removing surrounding whitespace.
 *
 * @param string $title Raw page title.
 * @param string $site  Site identifier; currently not used by the clean-up.
 *
 * @return string The trimmed title.
 */
function brivaciaCleanPageTitle(string $title, string $site): string {
    $cleaned = trim($title);

    return $cleaned;
}


/*
|--------------------------------------------------------------------------
| Metadata extraction
|--------------------------------------------------------------------------
*/

/**
 * Extracts the text of the first <title> element from an HTML document.
 *
 * Nested markup is stripped, HTML entities are decoded using the HTML5
 * entity table (so "&apos;" is handled too), and ASCII whitespace is
 * collapsed and trimmed the same way browsers compute document.title.
 *
 * @param string $html Raw HTML source.
 *
 * @return string The normalised title text, or '' when no <title> element is found.
 */
function brivaciaExtractHtmlTitle(string $html): string {
    // Require whitespace or '>' after the tag name so other tag names that
    // merely start with "title" are not mistaken for the title element.
    if (preg_match('~<title(?:\s[^>]*)?>(.*?)</title>~is', $html, $m) !== 1) {
        return '';
    }

    $text = html_entity_decode(strip_tags($m[1]), ENT_QUOTES | ENT_HTML5, 'UTF-8');

    // Collapse runs of ASCII whitespace (tab, LF, FF, CR, space). The class is
    // spelled out instead of \s so multi-byte characters are never touched.
    $text = preg_replace('~[\t\n\f\r ]+~', ' ', $text) ?? $text;

    // Trim last: stripping tags and decoding entities can expose new
    // leading/trailing whitespace.
    return trim($text);
}

/**
 * Finds the canonical URL declared by an HTML document.
 *
 * Lookup order:
 *  1. the first <link> whose rel attribute contains the token "canonical"
 *     and whose href is non-empty;
 *  2. the first <meta> whose property or name attribute is "og:url" and
 *     whose content is non-empty.
 *
 * Attribute values may be double-quoted, single-quoted or unquoted. The
 * value is entity-decoded and trimmed before it is returned.
 *
 * @param string $html Raw HTML source.
 *
 * @return string The declared URL (absolute or relative), or '' when none is found.
 */
function brivaciaExtractCanonicalUrl(string $html): string {
    // Reads one attribute from a single tag. The look-behind keeps "href"
    // from matching inside "data-href" or "xlink:href", and each quoting
    // style is matched separately so a value such as "it's" is not cut
    // short at the other kind of quote.
    $attribute = static function (string $tag, string $name): string {
        $pattern = '~(?<![\w:-])' . preg_quote($name, '~')
            . '\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))~i';

        if (preg_match($pattern, $tag, $match) !== 1) {
            return '';
        }

        return trim(html_entity_decode($match[1], ENT_QUOTES, 'UTF-8'));
    };

    if (preg_match_all('~<link(?![\w:-])[^>]*>~i', $html, $links)) {
        foreach ($links[0] as $tag) {
            // rel is a whitespace-separated token list; compare whole tokens.
            $relTokens = preg_split('~\s+~', strtolower($attribute($tag, 'rel')), -1, PREG_SPLIT_NO_EMPTY);

            if (!in_array('canonical', $relTokens ?: [], true)) {
                continue;
            }

            $href = $attribute($tag, 'href');

            if ($href !== '') {
                return $href;
            }
        }
    }

    if (preg_match_all('~<meta(?![\w:-])[^>]*>~i', $html, $metas)) {
        foreach ($metas[0] as $tag) {
            // Match on the identifying attribute only, so "og:url" appearing
            // inside some other tag's content does not count.
            $identifiers = [
                strtolower($attribute($tag, 'property')),
                strtolower($attribute($tag, 'name')),
            ];

            if (!in_array('og:url', $identifiers, true)) {
                continue;
            }

            $content = $attribute($tag, 'content');

            if ($content !== '') {
                return $content;
            }
        }
    }

    return '';
}

/**
 * Reduces an absolute or relative URL to its path plus optional query string.
 *
 * Scheme, host and fragment are dropped. When parse_url() cannot supply a
 * non-empty path, the input is returned unchanged if it starts with "/",
 * and '' otherwise.
 *
 * @param string $url Absolute or relative URL.
 *
 * @return string "path" or "path?query", or '' when no path can be determined.
 */
function brivaciaPathFromAbsoluteOrRelativeUrl(string $url): string {
    $parts = parse_url($url);
    $path = is_array($parts) ? ($parts['path'] ?? null) : null;

    if (!is_string($path) || $path === '') {
        // Unparseable or path-less input: only keep it if it still looks like a path.
        return str_starts_with($url, '/') ? $url : '';
    }

    $query = $parts['query'] ?? null;

    if (!is_string($query) || $query === '') {
        return $path;
    }

    return $path . '?' . $query;
}


/*
|--------------------------------------------------------------------------
| Metadata fallback detection
|--------------------------------------------------------------------------
*/

/**
 * Decides whether a page's title/URL pair should be treated as fallback data.
 *
 * The pair counts as a fallback when any of these holds:
 *  - the title is empty;
 *  - the URL is empty;
 *  - the URL is exactly "/" followed by the raw page key;
 *  - the raw page key is non-empty and appears in the title (this also
 *    covers a title that is identical to the raw key).
 *
 * The raw key is the page key with any "<prefix>:" part removed by
 * brivaciaRawPageKey().
 *
 * @param string $key   Page key, optionally language-prefixed.
 * @param string $title Page title to evaluate.
 * @param string $url   Page URL or path to evaluate.
 *
 * @return bool True when the metadata looks like fallback data.
 */
function brivaciaShouldLogPageFallback(string $key, string $title, string $url): bool {
    $rawKey = brivaciaRawPageKey($key);

    if ($title === '' || $url === '') {
        return true;
    }

    if ($url === '/' . $rawKey) {
        return true;
    }

    // str_contains() is true for an empty needle, so without this guard an
    // empty raw key would flag every title as a fallback.
    return $rawKey !== '' && str_contains($title, $rawKey);
}


/*
|--------------------------------------------------------------------------
| Metadata resolution
|--------------------------------------------------------------------------
*/

/**
 * Fetches a tracked page and derives its title and path from the HTML.
 *
 * The page URL is built with brivaciaPageUrl() and downloaded with
 * file_get_contents(), using the stream context from brivaciaHttpContext(3)
 * (defined elsewhere; the argument is presumably a timeout in seconds —
 * confirm). Warnings from the download are suppressed.
 *
 * The path comes from the document's canonical URL, falling back to the
 * fetched URL. An empty result is returned when the download fails or
 * yields a falsy body, or when brivaciaShouldLogPageFallback() rejects the
 * title/path pair.
 *
 * @param string $site    Site identifier.
 * @param string $pageKey Page key of the page to resolve.
 * @param string $url     Optional absolute or site-relative page URL.
 *
 * @return array{title: string, url: string} Resolved metadata; both values are '' on failure.
 */
function brivaciaResolvePageMetadata(string $site, string $pageKey, string $url = ''): array {
    $unresolved = ['title' => '', 'url' => ''];

    $fetchUrl = brivaciaPageUrl($site, $pageKey, $url);
    $body = @file_get_contents($fetchUrl, false, brivaciaHttpContext(3));

    if (!$body) {
        return $unresolved;
    }

    $pageTitle = brivaciaExtractHtmlTitle($body);
    $pagePath = brivaciaPathFromAbsoluteOrRelativeUrl(brivaciaExtractCanonicalUrl($body));

    // No usable canonical URL: derive the path from the URL that was fetched.
    if ($pagePath === '') {
        $pagePath = brivaciaPathFromAbsoluteOrRelativeUrl($fetchUrl);
    }

    return brivaciaShouldLogPageFallback($pageKey, $pageTitle, $pagePath)
        ? $unresolved
        : ['title' => $pageTitle, 'url' => $pagePath];
}