composer update pear/text_languagedetect

This commit is contained in:
Mario Vavti
2020-08-22 20:01:59 +02:00
parent 88a68f96da
commit 646dce7765
18 changed files with 256 additions and 39 deletions

View File

@@ -11,7 +11,7 @@
|lukasreschke/id3parser|0.0.3.0|GPL|https://github.com/LukasReschke/ID3Parser.git|
|michelf/php-markdown|1.9.0.0|BSD-3-Clause|https://github.com/michelf/php-markdown.git|
|paragonie/random_compat|9.99.99.0|MIT|https://github.com/paragonie/random_compat.git|
|pear/text_languagedetect|1.0.0.0|BSD-2-Clause|https://github.com/pear/Text_LanguageDetect.git|
|pear/text_languagedetect|1.0.1.0|BSD-2-Clause|https://github.com/pear/Text_LanguageDetect.git|
|psr/log|1.1.3.0|MIT|https://github.com/php-fig/log.git|
|ramsey/uuid|3.9.3.0|MIT|https://github.com/ramsey/uuid.git|
|sabre/dav|4.1.1.0|BSD-3-Clause|https://github.com/sabre-io/dav.git|

12
composer.lock generated
View File

@@ -472,20 +472,20 @@
},
{
"name": "pear/text_languagedetect",
"version": "v1.0.0",
"version": "v1.0.1",
"source": {
"type": "git",
"url": "https://github.com/pear/Text_LanguageDetect.git",
"reference": "bb9ff6f4970f686fac59081e916b456021fe7ba6"
"reference": "9e253f26cef9a9066f53f200cc3e0684018cb5b5"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/pear/Text_LanguageDetect/zipball/bb9ff6f4970f686fac59081e916b456021fe7ba6",
"reference": "bb9ff6f4970f686fac59081e916b456021fe7ba6",
"url": "https://api.github.com/repos/pear/Text_LanguageDetect/zipball/9e253f26cef9a9066f53f200cc3e0684018cb5b5",
"reference": "9e253f26cef9a9066f53f200cc3e0684018cb5b5",
"shasum": ""
},
"require-dev": {
"phpunit/phpunit": "*"
"phpunit/phpunit": "8.*|9.*"
},
"suggest": {
"ext-mbstring": "May require the mbstring PHP extension"
@@ -512,7 +512,7 @@
],
"description": "Identify human languages from text samples",
"homepage": "http://pear.php.net/package/Text_LanguageDetect",
"time": "2017-03-02T16:14:08+00:00"
"time": "2020-05-17T12:19:40+00:00"
},
{
"name": "psr/log",

View File

@@ -970,8 +970,10 @@ return array(
'TPC_yyStackEntry' => $vendorDir . '/smarty/smarty/libs/sysplugins/smarty_internal_configfileparser.php',
'TP_yyStackEntry' => $vendorDir . '/smarty/smarty/libs/sysplugins/smarty_internal_templateparser.php',
'Text_LanguageDetect' => $vendorDir . '/pear/text_languagedetect/Text/LanguageDetect.php',
'Text_LanguageDetectTest' => $vendorDir . '/pear/text_languagedetect/tests/Text_LanguageDetectTest.php',
'Text_LanguageDetect_Exception' => $vendorDir . '/pear/text_languagedetect/Text/LanguageDetect/Exception.php',
'Text_LanguageDetect_ISO639' => $vendorDir . '/pear/text_languagedetect/Text/LanguageDetect/ISO639.php',
'Text_LanguageDetect_ISO639Test' => $vendorDir . '/pear/text_languagedetect/tests/Text_LanguageDetect_ISO639Test.php',
'Text_LanguageDetect_Parser' => $vendorDir . '/pear/text_languagedetect/Text/LanguageDetect/Parser.php',
'UploadHandler' => $vendorDir . '/blueimp/jquery-file-upload/server/php/UploadHandler.php',
'Zotlabs\\Access\\AccessList' => $baseDir . '/Zotlabs/Access/AccessList.php',

View File

@@ -1138,8 +1138,10 @@ class ComposerStaticInit7b34d7e50a62201ec5d5e526a5b8b35d
'TPC_yyStackEntry' => __DIR__ . '/..' . '/smarty/smarty/libs/sysplugins/smarty_internal_configfileparser.php',
'TP_yyStackEntry' => __DIR__ . '/..' . '/smarty/smarty/libs/sysplugins/smarty_internal_templateparser.php',
'Text_LanguageDetect' => __DIR__ . '/..' . '/pear/text_languagedetect/Text/LanguageDetect.php',
'Text_LanguageDetectTest' => __DIR__ . '/..' . '/pear/text_languagedetect/tests/Text_LanguageDetectTest.php',
'Text_LanguageDetect_Exception' => __DIR__ . '/..' . '/pear/text_languagedetect/Text/LanguageDetect/Exception.php',
'Text_LanguageDetect_ISO639' => __DIR__ . '/..' . '/pear/text_languagedetect/Text/LanguageDetect/ISO639.php',
'Text_LanguageDetect_ISO639Test' => __DIR__ . '/..' . '/pear/text_languagedetect/tests/Text_LanguageDetect_ISO639Test.php',
'Text_LanguageDetect_Parser' => __DIR__ . '/..' . '/pear/text_languagedetect/Text/LanguageDetect/Parser.php',
'UploadHandler' => __DIR__ . '/..' . '/blueimp/jquery-file-upload/server/php/UploadHandler.php',
'Zotlabs\\Access\\AccessList' => __DIR__ . '/../..' . '/Zotlabs/Access/AccessList.php',

View File

@@ -483,26 +483,26 @@
},
{
"name": "pear/text_languagedetect",
"version": "v1.0.0",
"version_normalized": "1.0.0.0",
"version": "v1.0.1",
"version_normalized": "1.0.1.0",
"source": {
"type": "git",
"url": "https://github.com/pear/Text_LanguageDetect.git",
"reference": "bb9ff6f4970f686fac59081e916b456021fe7ba6"
"reference": "9e253f26cef9a9066f53f200cc3e0684018cb5b5"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/pear/Text_LanguageDetect/zipball/bb9ff6f4970f686fac59081e916b456021fe7ba6",
"reference": "bb9ff6f4970f686fac59081e916b456021fe7ba6",
"url": "https://api.github.com/repos/pear/Text_LanguageDetect/zipball/9e253f26cef9a9066f53f200cc3e0684018cb5b5",
"reference": "9e253f26cef9a9066f53f200cc3e0684018cb5b5",
"shasum": ""
},
"require-dev": {
"phpunit/phpunit": "*"
"phpunit/phpunit": "8.*|9.*"
},
"suggest": {
"ext-mbstring": "May require the mbstring PHP extension"
},
"time": "2017-03-02T16:14:08+00:00",
"time": "2020-05-17T12:19:40+00:00",
"type": "library",
"installation-source": "dist",
"autoload": {

View File

@@ -0,0 +1,6 @@
# composer related
composer.lock
composer.phar
vendor
# NOTE(review): presumably generated artifacts (rendered README, build output) — confirm
/README.html
/dist

View File

@@ -0,0 +1,14 @@
# Travis CI configuration: runs the checks below on each listed PHP version.
language: php
sudo: false
# PHP versions the build is run against
php:
- 7.2
- 7.3
- 7.4
# Tooling setup: PHP_CodeSniffer via pear, project dependencies via composer.
install:
- pear install pear/PHP_CodeSniffer
- composer install
# NOTE(review): presumably refreshes the phpenv shims so the freshly installed phpcs is on PATH — confirm
- phpenv rehash
# Checks: composer.json validation, PHPUnit on tests/ with a text coverage report, phpcs on Text/.
script:
- composer validate
- ./vendor/bin/phpunit --coverage-text tests
- phpcs Text/

View File

@@ -155,3 +155,12 @@ Unit test status
.. image:: https://travis-ci.org/pear/Text_LanguageDetect.svg?branch=master
:target: https://travis-ci.org/pear/Text_LanguageDetect
Notes
=====
Where does the data come from?
I don't recall where I got the original data set.
It's just the frequencies of 3-letter combinations (trigrams) in each supported language.
It could be generated from a few random Wikipedia pages for each language.

View File

@@ -12,9 +12,9 @@
* @link http://pear.php.net/package/Text_LanguageDetect/
*/
//require_once 'Text/LanguageDetect/Exception.php';
//require_once 'Text/LanguageDetect/Parser.php';
//require_once 'Text/LanguageDetect/ISO639.php';
require_once 'Text/LanguageDetect/Exception.php';
require_once 'Text/LanguageDetect/Parser.php';
require_once 'Text/LanguageDetect/ISO639.php';
/**
* Detects the language of a given piece of text.
@@ -189,7 +189,7 @@ class Text_LanguageDetect
*/
protected function _get_data_loc($fname)
{
if ($fname{0} == '/' || $fname{0} == '.') {
if ($fname[0] == '/' || $fname[0] == '.') {
// if filename starts with a slash, assume it's an absolute pathname
// and skip whatever is in $this->_data_dir
return $fname;
@@ -247,12 +247,6 @@ class Text_LanguageDetect
protected function _checkTrigram($trigram)
{
if (!is_array($trigram)) {
if (ini_get('magic_quotes_runtime')) {
throw new Text_LanguageDetect_Exception(
'Error loading database. Try turning magic_quotes_runtime off.',
Text_LanguageDetect_Exception::MAGIC_QUOTES
);
}
throw new Text_LanguageDetect_Exception(
'Language database is not an array.',
Text_LanguageDetect_Exception::DB_NOT_ARRAY
@@ -1470,31 +1464,31 @@ class Text_LanguageDetect
case 1:
// normal ASCII-7 byte
// 0xxxxxxx --> 0xxxxxxx
return ord($char{0});
return ord($char[0]);
case 2:
// 2 byte unicode
// 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx
$z = (ord($char{0}) & 0x000001F) << 6;
$x = (ord($char{1}) & 0x0000003F);
$z = (ord($char[0]) & 0x000001F) << 6;
$x = (ord($char[1]) & 0x0000003F);
return ($z | $x);
case 3:
// 3 byte unicode
// 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx
$z = (ord($char{0}) & 0x0000000F) << 12;
$x1 = (ord($char{1}) & 0x0000003F) << 6;
$x2 = (ord($char{2}) & 0x0000003F);
$z = (ord($char[0]) & 0x0000000F) << 12;
$x1 = (ord($char[1]) & 0x0000003F) << 6;
$x2 = (ord($char[2]) & 0x0000003F);
return ($z | $x1 | $x2);
case 4:
// 4 byte unicode
// 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx -->
// 000zzzzz xxxxxxxx xxxxxxxx
$z1 = (ord($char{0}) & 0x00000007) << 18;
$z2 = (ord($char{1}) & 0x0000003F) << 12;
$x1 = (ord($char{2}) & 0x0000003F) << 6;
$x2 = (ord($char{3}) & 0x0000003F);
$z1 = (ord($char[0]) & 0x00000007) << 18;
$z2 = (ord($char[1]) & 0x0000003F) << 12;
$x1 = (ord($char[2]) & 0x0000003F) << 6;
$x2 = (ord($char[3]) & 0x0000003F);
return ($z1 | $z2 | $x1 | $x2);
}
}
@@ -1514,7 +1508,7 @@ class Text_LanguageDetect
*/
protected static function _next_char($str, &$counter, $special_convert = false)
{
$char = $str{$counter++};
$char = $str[$counter++];
$ord = ord($char);
// for a description of the utf8 system see
@@ -1538,7 +1532,7 @@ class Text_LanguageDetect
} elseif ($ord >> 5 == 6) { // two-byte char
// multi-byte chars
$nextchar = $str{$counter++}; // get next byte
$nextchar = $str[$counter++]; // get next byte
// lower-casing of non-ascii characters is still incomplete
@@ -1580,12 +1574,12 @@ class Text_LanguageDetect
} elseif ($ord >> 4 == 14) { // three-byte char
// tag on next 2 bytes
return $char . $str{$counter++} . $str{$counter++};
return $char . $str[$counter++] . $str[$counter++];
} elseif ($ord >> 3 == 30) { // four-byte char
// tag on next 3 bytes
return $char . $str{$counter++} . $str{$counter++} . $str{$counter++};
return $char . $str[$counter++] . $str[$counter++] . $str[$counter++];
} else {
// error?

View File

@@ -27,6 +27,6 @@
"ext-mbstring": "May require the mbstring PHP extension"
},
"require-dev": {
"phpunit/phpunit": "*"
"phpunit/phpunit": "8.*|9.*"
}
}

View File

@@ -0,0 +1,18 @@
<?php
/**
 * Prints the most probable languages for a sample sentence,
 * one "language: confidence" pair per line.
 */
require_once 'Text/LanguageDetect.php';

$sample   = 'Was wäre, wenn ich Ihnen das jetzt sagen würde?';
$detector = new Text_LanguageDetect();

// ask for the 3 most probable languages
$candidates = $detector->detect($sample, 3);
foreach ($candidates as $name => $score) {
    echo $name, ': ', number_format($score, 2), "\n";
}
//output:
//german: 0.35
//dutch: 0.25
//swedish: 0.20
?>

View File

@@ -0,0 +1,15 @@
<?php
/**
 * How to handle errors
 *
 * Runs a detection inside try/catch and prints the exception message if a
 * Text_LanguageDetect_Exception is thrown.
 */
require_once 'Text/LanguageDetect.php';
require_once 'Text/LanguageDetect/Exception.php';
try {
    $ld = new Text_LanguageDetect();
    $lang = $ld->detectSimple('Das ist ein kleiner Text');
    echo "Language is: $lang\n";
} catch (Text_LanguageDetect_Exception $e) {
    // The text is labelled "Message:", so print only the message; casting the
    // exception to a string would also dump the class name, file and trace.
    echo 'An error occurred! Message: ' . $e->getMessage() . "\n";
}
?>

View File

@@ -0,0 +1,35 @@
<?php
/**
 * example usage (CLI)
 *
 * Prints the supported languages, then reads standard input line by line and
 * dumps the language detection result and the Unicode block result for each
 * line.
 *
 * @package Text_LanguageDetect
 * @version CVS: $Id$
 */
require_once 'Text/LanguageDetect.php';
$l = new Text_LanguageDetect;
$stdin = fopen('php://stdin', 'r');
echo "Supported languages:\n";
$langs = $l->getLanguages();
sort($langs);
echo implode(', ', $langs);
echo "\ntotal ", count($langs), "\n\n";
// Compare against false explicitly: fgets() returns false only at EOF/error,
// while a plain truthiness test would also stop on a final line that is
// just "0" without a trailing newline.
while (($line = fgets($stdin)) !== false) {
    $result = $l->detect($line, 4);
    print_r($result);
    $blocks = $l->detectUnicodeBlocks($line, true);
    print_r($blocks);
}
fclose($stdin);
unset($l);
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
?>

View File

@@ -0,0 +1,72 @@
<?php
/**
 * example usage (web)
 *
 * Renders a form with a "q" text field. When text has been submitted, shows
 * the detected language and the Unicode blocks found in that text.
 *
 * @package Text_LanguageDetect
 * @version CVS: $Id$
 */
// browsers will encode multi-byte characters wrong unless they think the page is utf8-encoded
header('Content-type: text/html; charset=utf-8', true);
require_once 'Text/LanguageDetect.php';
$l = new Text_LanguageDetect;
// Always define $q so the template below never reads an undefined variable.
// The submitted value is used verbatim: magic quotes no longer exist, so
// stripslashes() would only strip legitimate backslashes from the input.
// Non-string input (e.g. q[]=x) is ignored.
$q = '';
if (isset($_REQUEST['q']) && is_string($_REQUEST['q'])) {
    $q = $_REQUEST['q'];
}
?>
<html>
<head>
<title>Text_LanguageDetect demonstration</title>
</head>
<body>
<h2>Text_LanguageDetect</h2>
<?php
// full "<?php" tag: the short "<?" form only works when short_open_tag is on
echo "<small>Supported languages:\n";
$langs = $l->getLanguages();
sort($langs);
echo implode(', ', array_map('ucfirst', $langs));
echo '<br />total ', count($langs), '</small><br /><br />';
?>
<form method="post">
Enter text to identify language (at least a couple of sentences):<br />
<textarea name="q" wrap="virtual" cols="80" rows="8"><?= htmlspecialchars($q, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') ?></textarea>
<br />
<input type="submit" value="Submit" />
</form>
<?php
if ($q !== '') {
    $len = $l->utf8strlen($q);
    if ($len < 20) { // this value picked somewhat arbitrarily
        echo "Warning: string not very long ($len chars)<br />\n";
    }
    $result = $l->detectConfidence($q);
    if ($result == null) {
        echo "Text_LanguageDetect cannot identify this piece of text. <br /><br />\n";
    } else {
        // escape before embedding in HTML, like every other value echoed into the page
        $language = htmlspecialchars((string) $result['language'], ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        echo "Text_LanguageDetect thinks this text is written in <b>{$language}</b> ({$result['similarity']}, {$result['confidence']})<br /><br />\n";
    }
    $result = $l->detectUnicodeBlocks($q, false);
    if (!empty($result)) {
        arsort($result);
        echo "Unicode blocks present: ", implode(', ', array_keys($result)), "\n<br /><br />";
    }
}
unset($l);
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
?>
</body></html>

View File

@@ -0,0 +1,19 @@
<?php
/**
 * Demonstrates how to use ISO language codes.
 *
 * The "name mode" changes the way languages are accepted and returned.
 */
require_once 'Text/LanguageDetect.php';

$detector = new Text_LanguageDetect();
$sentence = 'Das ist ein kleiner Text';

// name mode 2 will output the ISO 639-1 two-letter language code ("de"),
// name mode 3 will output the ISO 639-2 three-letter language code ("deu")
foreach (array(2, 3) as $nameMode) {
    $detector->setNameMode($nameMode);
    echo $detector->detectSimple($sentence), "\n";
}
?>

View File

@@ -0,0 +1,11 @@
<?php
/**
 * List all supported languages
 */
require_once 'Text/LanguageDetect.php';

$detector  = new Text_LanguageDetect();
$languages = $detector->getLanguages();

// one language name per line
foreach ($languages as $name) {
    echo $name, "\n";
}
?>

View File

@@ -0,0 +1,10 @@
<?php
/**
 * Detects the language of one sample sentence and dumps the result.
 */
require_once 'Text/LanguageDetect.php';

$detector = new Text_LanguageDetect();
$sample   = 'Was wäre, wenn ich Ihnen das jetzt sagen würde?';

var_dump($detector->detectSimple($sample));
//output: german
?>

View File

@@ -0,0 +1,10 @@
<?xml version="1.0"?>
<!-- PHP_CodeSniffer ruleset: the PEAR standard minus the naming sniffs excluded below. -->
<ruleset name="PEAR-textlanguagedetect">
<rule ref="PEAR">
<!-- we keep the old php4-style variable names for now -->
<exclude name="PEAR.NamingConventions.ValidFunctionName.PublicUnderscore"/>
<exclude name="PEAR.NamingConventions.ValidVariableName.PublicUnderscore"/>
<!-- we keep the method names for BC reasons -->
<exclude name="PEAR.NamingConventions.ValidFunctionName.ScopeNotCamelCaps"/>
</rule>
</ruleset>