includes/rules.php
<?php
declare(strict_types=1);
/*
|--------------------------------------------------------------------------
| Page identifiers
|--------------------------------------------------------------------------
*/
/**
 * Builds the key under which a page is stored, optionally namespaced by the
 * tracked website's language.
 *
 * @param string $site               Site identifier (currently unused by this function).
 * @param string $page               Raw page identifier.
 * @param string $trackedWebsiteLang Language code; an empty string means "no prefix".
 *
 * @return string "<lang>:<page>" when a language is given, otherwise $page unchanged.
 */
function brivaciaPageKey(string $site, string $page, string $trackedWebsiteLang = ''): string
{
    // No language known: the raw page identifier is the key.
    if ($trackedWebsiteLang === '') {
        return $page;
    }

    return "{$trackedWebsiteLang}:{$page}";
}
/**
 * Returns the part of a page key that follows its first ':' separator.
 *
 * Keys without any ':' are returned unchanged. Only the first ':' is treated
 * as the separator, so later colons stay part of the result.
 *
 * @param string $key Page key, possibly of the form "<prefix>:<page>".
 *
 * @return string Everything after the first ':', or $key itself when there is none.
 */
function brivaciaRawPageKey(string $key): string
{
    $separatorAt = strpos($key, ':');

    if ($separatorAt === false) {
        return $key;
    }

    return substr($key, $separatorAt + 1);
}
/*
|--------------------------------------------------------------------------
| Tracked website language
|--------------------------------------------------------------------------
|
| Tries to infer the tracked website language from a page URL or referrer.
| This is only about the tracked website, not the Brivacia dashboard language.
|
*/
/**
 * Infers the tracked website's language code.
 *
 * Resolution order:
 *  1. $fallback, when non-empty after trimming (truncated to 10 bytes and
 *     lower-cased) — despite its name it takes precedence over the URLs;
 *  2. the first path segment of $url, then of $ref, when it looks like
 *     "xx" or "xx-yy" (letters only, case-insensitive);
 *  3. an empty string when nothing matched.
 *
 * @param string $url      Page URL to inspect.
 * @param string $ref      Referrer URL, inspected after $url.
 * @param string $fallback Explicit language value; wins whenever it is non-empty.
 *
 * @return string Lower-cased language code, or '' when none could be determined.
 */
function brivaciaInferTrackedWebsiteLang(string $url, string $ref = '', string $fallback = ''): string
{
    $explicit = strtolower(substr(trim($fallback), 0, 10));
    if ($explicit !== '') {
        return $explicit;
    }

    foreach ([$url, $ref] as $candidate) {
        // parse_url() yields null/false when there is no usable path.
        $path = parse_url($candidate, PHP_URL_PATH);

        if (is_string($path) && $path !== '') {
            $segments = explode('/', trim($path, '/'));
            $leading = trim($segments[0]);

            if (preg_match('~^[a-z]{2}(?:-[a-z]{2})?$~i', $leading) === 1) {
                return strtolower($leading);
            }
        }
    }

    return '';
}
/*
|--------------------------------------------------------------------------
| Page URLs
|--------------------------------------------------------------------------
*/
/**
 * Builds an absolute https URL for a page of the given site.
 *
 * - An explicit $url that already starts with http:// or https:// is returned as is.
 * - Any other non-empty $url is treated as a path on the site's host.
 * - With no $url, the page key itself is used as the path.
 *
 * The host comes from siteLabel($site), which is defined elsewhere; this code
 * assumes it returns a bare host name — TODO confirm.
 *
 * NOTE(review): $pageKey is used verbatim, so a language-prefixed key such as
 * "en:/about" would end up inside the URL — confirm callers pass raw keys.
 *
 * @param string $site    Site identifier handed to siteLabel().
 * @param string $pageKey Page key used as the path when $url is empty.
 * @param string $url     Optional absolute or site-relative URL.
 *
 * @return string Absolute URL.
 */
function brivaciaPageUrl(string $site, string $pageKey, string $url = ''): string
{
    $origin = 'https://' . siteLabel($site);

    if ($url === '') {
        // A key that already starts with '/' is appended untouched (leading
        // slashes are NOT collapsed); otherwise a single '/' is inserted.
        $suffix = str_starts_with($pageKey, '/') ? $pageKey : '/' . $pageKey;

        return $origin . $suffix;
    }

    if (preg_match('~^https?://~i', $url) === 1) {
        return $url;
    }

    return $origin . '/' . ltrim($url, '/');
}
/*
|--------------------------------------------------------------------------
| Page titles
|--------------------------------------------------------------------------
*/
/**
 * Normalises a page title by stripping surrounding whitespace.
 *
 * @param string $title Title as received.
 * @param string $site  Site identifier (currently unused by this function).
 *
 * @return string The trimmed title.
 */
function brivaciaCleanPageTitle(string $title, string $site): string
{
    $cleaned = trim($title);

    return $cleaned;
}
/*
|--------------------------------------------------------------------------
| Metadata extraction
|--------------------------------------------------------------------------
*/
/**
 * Extracts the human-readable text of the first <title> element in an HTML document.
 *
 * Processing steps:
 *  1. nested markup is stripped BEFORE entities are decoded, so escaped text
 *     such as "&lt;b&gt;" survives as literal "<b>";
 *  2. entities are decoded with the HTML5 table (so "&apos;" is handled too);
 *  3. runs of ASCII whitespace (spaces, tabs, line breaks from pretty-printed
 *     markup) are collapsed to one space and the result is trimmed, which
 *     mirrors how browsers expose document.title.
 *
 * @param string $html Raw HTML document.
 *
 * @return string The cleaned title, or '' when no <title> element is found.
 */
function brivaciaExtractHtmlTitle(string $html): string
{
    // \b keeps other elements whose name merely starts with "title" from matching.
    if (preg_match('~<title\b[^>]*>(.*?)</title>~is', $html, $match) !== 1) {
        return '';
    }

    $text = html_entity_decode(
        strip_tags($match[1]),
        ENT_QUOTES | ENT_HTML5,
        'UTF-8',
    );

    // Explicit ASCII class instead of \s: without the /u modifier \s may be
    // locale dependent and could split multi-byte UTF-8 sequences.
    $text = preg_replace('~[ \t\n\r\f]+~', ' ', $text) ?? $text;

    return trim($text);
}
/**
 * Finds the canonical URL advertised by an HTML document.
 *
 * Lookup order:
 *  1. the first <link> whose rel attribute contains the token "canonical"
 *     and that carries a non-empty href;
 *  2. the first <meta> whose property (or name) attribute is "og:url" and
 *     that carries a non-empty content.
 *
 * Attribute values may be double-quoted, single-quoted or unquoted, and
 * whitespace around '=' is tolerated. A quoted value ends only at its own
 * quote character, so a URL containing the other quote (e.g. an apostrophe
 * inside a double-quoted href) is not truncated.
 *
 * @param string $html Raw HTML document.
 *
 * @return string The entity-decoded, trimmed URL, or '' when none is found.
 */
function brivaciaExtractCanonicalUrl(string $html): string
{
    // Reads one attribute from a single tag; '' when absent or empty.
    $attribute = static function (string $tag, string $name): string {
        // The look-behind stops "href" from matching inside "data-href" or "xlink:href".
        $pattern = '~(?<![\w:-])' . preg_quote($name, '~')
            . '\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s"\'=<>`]+))~i';

        if (preg_match($pattern, $tag, $m) !== 1) {
            return '';
        }

        // Exactly one of the three alternatives captured the value.
        foreach ([1, 2, 3] as $group) {
            if (($m[$group] ?? '') !== '') {
                return trim(html_entity_decode($m[$group], ENT_QUOTES, 'UTF-8'));
            }
        }

        return '';
    };

    if (preg_match_all('~<link\b[^>]*>~i', $html, $links)) {
        foreach ($links[0] as $tag) {
            // rel is a space-separated token list; compare whole tokens so
            // values that merely contain "canonical" do not qualify.
            $relTokens = preg_split('~\s+~', strtolower($attribute($tag, 'rel')), -1, PREG_SPLIT_NO_EMPTY);
            if (!in_array('canonical', $relTokens ?: [], true)) {
                continue;
            }

            $href = $attribute($tag, 'href');
            if ($href !== '') {
                return $href;
            }
        }
    }

    if (preg_match_all('~<meta\b[^>]*>~i', $html, $metas)) {
        foreach ($metas[0] as $tag) {
            $isOgUrl = strtolower($attribute($tag, 'property')) === 'og:url'
                || strtolower($attribute($tag, 'name')) === 'og:url';
            if (!$isOgUrl) {
                continue;
            }

            $content = $attribute($tag, 'content');
            if ($content !== '') {
                return $content;
            }
        }
    }

    return '';
}
/**
 * Reduces an absolute or site-relative URL to "path" or "path?query".
 *
 * Scheme, host and fragment are dropped. When parse_url() cannot produce a
 * non-empty path, the input is returned unchanged if it starts with '/',
 * otherwise '' is returned.
 *
 * @param string $url Absolute or relative URL.
 *
 * @return string Path with optional query string, or '' when none can be derived.
 */
function brivaciaPathFromAbsoluteOrRelativeUrl(string $url): string
{
    $parts = parse_url($url);
    $path = is_array($parts) ? ($parts['path'] ?? null) : null;

    if (!is_string($path) || $path === '') {
        // Nothing usable from the parser: accept the raw input only if it
        // already looks like a rooted path.
        return str_starts_with($url, '/') ? $url : '';
    }

    $query = $parts['query'] ?? null;
    if (is_string($query) && $query !== '') {
        return $path . '?' . $query;
    }

    return $path;
}
/*
|--------------------------------------------------------------------------
| Metadata fallback detection
|--------------------------------------------------------------------------
*/
/**
 * Tells whether resolved page metadata is no better than the page key itself.
 *
 * Returns true when any of these holds (the key is first reduced with
 * brivaciaRawPageKey()):
 *  - the title or the URL is empty;
 *  - the title equals, or merely contains, the raw key;
 *  - the URL is exactly '/' followed by the raw key.
 *
 * NOTE(review): an empty raw key makes the "contains" test true for every
 * title — confirm that this is intended.
 *
 * @param string $key   Page key, possibly language-prefixed.
 * @param string $title Resolved title.
 * @param string $url   Resolved URL/path.
 *
 * @return bool True when the metadata should be treated as a fallback.
 */
function brivaciaShouldLogPageFallback(string $key, string $title, string $url): bool
{
    $bareKey = brivaciaRawPageKey($key);

    if ($title === '' || $url === '') {
        return true;
    }

    if ($url === '/' . $bareKey) {
        return true;
    }

    // Covers both "title equals the key" and "title contains the key".
    return $title === $bareKey || str_contains($title, $bareKey);
}
/*
|--------------------------------------------------------------------------
| Metadata resolution
|--------------------------------------------------------------------------
*/
/**
 * Fetches a page over HTTP and derives its title and canonical path.
 *
 * The page URL is built by brivaciaPageUrl(). The path comes from the
 * document's canonical URL, falling back to the fetched URL itself. The
 * result is blanked when the download fails or returns a falsy body, or when
 * brivaciaShouldLogPageFallback() judges the metadata no better than the key.
 *
 * Download warnings are deliberately suppressed with '@': failure is an
 * expected outcome and is reported through the empty result.
 *
 * NOTE(review): the argument 3 to brivaciaHttpContext() is presumably a
 * timeout in seconds — confirm against that helper.
 * NOTE(review): an absolute $url is fetched as given; if it can originate
 * from untrusted input, consider restricting it to the site's own host.
 *
 * @param string $site    Site identifier.
 * @param string $pageKey Page key, possibly language-prefixed.
 * @param string $url     Optional absolute or site-relative URL of the page.
 *
 * @return array{title: string, url: string} Both values are '' when nothing useful was resolved.
 */
function brivaciaResolvePageMetadata(string $site, string $pageKey, string $url = ''): array
{
    $unresolved = ['title' => '', 'url' => ''];

    $target = brivaciaPageUrl($site, $pageKey, $url);
    $markup = @file_get_contents($target, false, brivaciaHttpContext(3));
    if (!$markup) {
        return $unresolved;
    }

    $title = brivaciaExtractHtmlTitle($markup);

    // Prefer the canonical URL advertised by the page; otherwise use the URL we fetched.
    $path = brivaciaPathFromAbsoluteOrRelativeUrl(brivaciaExtractCanonicalUrl($markup));
    if ($path === '') {
        $path = brivaciaPathFromAbsoluteOrRelativeUrl($target);
    }

    return brivaciaShouldLogPageFallback($pageKey, $title, $path)
        ? $unresolved
        : ['title' => $title, 'url' => $path];
}