mirror of
https://git.friendi.ca/friendica/friendica-addons.git
synced 2025-07-11 19:08:49 +00:00
CLD: New plugin for language detection via CLD2
This commit is contained in:
parent
a5ed02ed23
commit
981e6821d0
2 changed files with 166 additions and 0 deletions
75
cld/cld.php
Normal file
75
cld/cld.php
Normal file
|
@ -0,0 +1,75 @@
|
|||
<?php
|
||||
/**
|
||||
* Name: Compact Language Detector
|
||||
* Description: Improved language detection
|
||||
* Version: 0.1
|
||||
* Author: Michael Vogel <heluecht@pirati.ca>
|
||||
*/
|
||||
|
||||
use Friendica\Core\Hook;
|
||||
use Friendica\Core\Logger;
|
||||
use Friendica\DI;
|
||||
|
||||
/**
 * Registers this addon's handler for the "get_language" hook.
 *
 * After registration, the hook system is told to call cld_get_language()
 * from addon/cld/cld.php whenever the "get_language" hook fires.
 * NOTE(review): presumably invoked by the addon loader on installation
 * (naming convention) - the caller is not visible in this file.
 */
function cld_install()
{
    $hookName  = 'get_language';
    $addonFile = 'addon/cld/cld.php';
    $callback  = 'cld_get_language';

    Hook::register($hookName, $addonFile, $callback);
}
|
||||
|
||||
/**
 * Hook handler for "get_language": detects the language of a text via CLD2.
 *
 * $data is left untouched when
 *  - the cld2 PHP extension is not loaded,
 *  - there is no text to analyse,
 *  - CLD2 marks its own result as unreliable,
 *  - the detected code is one of the temporarily excluded codes, or
 *  - the detected code is not among the languages reported by DI::l10n().
 *
 * @param array $data Hook data, modified in place. Keys used here:
 *                    'text'     - the text to analyse;
 *                    'detected' - an earlier detection result; only its first
 *                                 key is read, and only for logging. On success
 *                                 it is replaced by
 *                                 [language code => language_probability / 100].
 *
 * @return void
 */
function cld_get_language(array &$data)
{
    if (!extension_loaded('cld2')) {
        Logger::warning('CLD2 is not installed.');
        return;
    }

    // Nothing to analyse; also avoids an "undefined array key" warning.
    $text = $data['text'] ?? '';
    if ($text === '') {
        return;
    }

    $detector = new \CLD2Detector();
    $detector->setEncodingHint(\CLD2Encoding::UTF8); // optional, hints about text encoding
    $result = $detector->detect($text);

    // Language that was detected before this addon ran (first key of the
    // previous result). Only used as log context; 'detected' may be absent.
    $previous = $data['detected'] ?? [];
    $original = (is_array($previous) && $previous) ? array_key_first($previous) : '';

    // CLD2 codes that are rewritten before being compared with the available
    // languages. Codes that are not listed are used unchanged.
    // NOTE(review): the targets presumably match the keys produced by
    // convertForLanguageDetection() - confirm against the core implementation.
    $aliases = [
        'pt'      => 'pt-PT',
        'el'      => 'el-monoton',
        'no'      => 'nb',
        'zh'      => 'zh-Hans',
        'zh-Hant' => 'zh-hant',
    ];

    $code     = $result['language_code'];
    $detected = $aliases[$code] ?? $code;

    // Shared context for every log entry below.
    $context = [
        'original'    => $original,
        'detected'    => $detected,
        'name'        => $result['language_name'],
        'probability' => $result['language_probability'],
        'text'        => $text,
    ];

    if (!$result['is_reliable']) {
        Logger::debug('Unreliable detection', $context);
        return;
    }

    // Only for testing purposes: these codes are ignored for now.
    if (in_array($detected, ['xx-Qaai', 'ht', 'ga'], true)) {
        return;
    }

    $available = array_keys(DI::l10n()->convertForLanguageDetection(DI::l10n()->getAvailableLanguages(true)));

    if (!in_array($detected, $available, true)) {
        Logger::debug('Unsupported language', $context);
        return;
    }

    Logger::debug('Detected', $context);

    // 'language_probability' is divided by 100 before being stored
    // (presumably a percentage converted to a 0..1 fraction - confirm).
    $data['detected'] = [$detected => $result['language_probability'] / 100];
}
|
Loading…
Add table
Add a link
Reference in a new issue