Commit 02dab418 authored by Pol Dellaiera's avatar Pol Dellaiera Committed by Arkadiusz Kondas
Browse files

Provide a new NGramTokenizer with minGram and maxGram support (#350)

* Issue #349: Provide a new NGramTokenizer.

* Issue #349: Add tests.

* Fixes from code review.

* Implement NGramTokenizer with min and max gram support

* Add missing tests for ngram

* Add info about NGramTokenizer to docs and readme

* Add performance test for tokenization
parent b3fe9dae
......@@ -102,6 +102,9 @@ Public datasets are available in a separate repository [php-ai/php-ml-datasets](
* [Imputation missing values](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/imputation-missing-values/)
* Feature Extraction
* [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer/)
* NGramTokenizer
* WhitespaceTokenizer
* WordTokenizer
* [Tf-idf Transformer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/tf-idf-transformer/)
* Dimensionality Reduction
* PCA (Principal Component Analysis)
......
......@@ -53,3 +53,21 @@ $vectorizer->getVocabulary();
* WhitespaceTokenizer - select tokens by whitespace.
* WordTokenizer - select tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator).
* NGramTokenizer - continuous sequence of characters of the specified length. They are useful for querying languages that don’t use spaces or that have long compound words, like German.
**NGramTokenizer**
The NGramTokenizer accepts the following parameters:
`$minGram` - minimum length of characters in a gram. Defaults to 1.
`$maxGram` - maximum length of characters in a gram. Defaults to 2.
```php
use Phpml\Tokenization\NGramTokenizer;
$tokenizer = new NGramTokenizer(1, 2);
$tokenizer->tokenize('Quick Fox');
// returns ['Q', 'u', 'i', 'c', 'k', 'Qu', 'ui', 'ic', 'ck', 'F', 'o', 'x', 'Fo', 'ox']
```
<?php
declare(strict_types=1);
namespace Phpml\Tokenization;
use Phpml\Exception\InvalidArgumentException;
class NGramTokenizer extends WordTokenizer
{
    /**
     * Minimum gram length, inclusive (always >= 1).
     *
     * @var int
     */
    private $minGram;

    /**
     * Maximum gram length, inclusive (always >= $minGram).
     *
     * @var int
     */
    private $maxGram;

    /**
     * @param int $minGram minimum length of characters in a gram, defaults to 1
     * @param int $maxGram maximum length of characters in a gram, defaults to 2
     *
     * @throws InvalidArgumentException when either bound is below 1 or $minGram exceeds $maxGram
     */
    public function __construct(int $minGram = 1, int $maxGram = 2)
    {
        if ($minGram < 1 || $maxGram < 1 || $minGram > $maxGram) {
            throw new InvalidArgumentException(sprintf('Invalid (%s, %s) minGram and maxGram value combination', $minGram, $maxGram));
        }

        $this->minGram = $minGram;
        $this->maxGram = $maxGram;
    }

    /**
     * Splits the text into words (runs of two or more word characters,
     * Unicode-aware via the `u` modifier) and returns every n-gram of
     * length $minGram..$maxGram from each word, word by word.
     *
     * {@inheritdoc}
     */
    public function tokenize(string $text): array
    {
        // preg_match_all populates $words itself; no pre-initialization needed.
        preg_match_all('/\w\w+/u', $text, $words);

        $nGrams = [];
        foreach ($words[0] as $word) {
            $this->generateNGrams($word, $nGrams);
        }

        return $nGrams;
    }

    /**
     * Appends all n-grams of $word with lengths $minGram..$maxGram to $nGrams.
     * Uses mb_* functions so multibyte (e.g. CJK) words are handled correctly.
     */
    private function generateNGrams(string $word, array &$nGrams): void
    {
        $length = mb_strlen($word);

        // Start directly at $minGram instead of looping from 1 and
        // filtering with an inner condition — same output, fewer iterations.
        for ($j = $this->minGram; $j <= $this->maxGram; $j++) {
            for ($k = 0; $k < $length - $j + 1; $k++) {
                $nGrams[] = mb_substr($word, $k, $j);
            }
        }
    }
}
<?php
declare(strict_types=1);
namespace Phpml\Tests\Performance\Tokenization;
use PhpBench\Benchmark\Metadata\Annotations\Iterations;
use PhpBench\Benchmark\Metadata\Annotations\Revs;
use Phpml\Tokenization\NGramTokenizer;
/**
 * PhpBench benchmark measuring NGramTokenizer::tokenize() throughput
 * on a fixed lorem-ipsum paragraph.
 */
final class NGramTokenizerBench
{
/**
 * Tokenizes the paragraph into 2- and 3-character grams
 * (minGram=2, maxGram=3); 1000 revolutions, 5 iterations.
 *
 * @Revs(1000)
 * @Iterations(5)
 */
public function benchSimpleTokenizer(): void
{
$tokenizer = new NGramTokenizer(2, 3);
$tokenizer->tokenize(
'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent placerat blandit cursus. Suspendisse sed
turpis sit amet enim viverra sodales a euismod est. Ut vitae tincidunt est. Proin venenatis placerat nunc
sed ornare. Etiam feugiat, nisl nec sollicitudin sodales, nulla massa sollicitudin ipsum, vitae cursus ante
velit vitae arcu. Vestibulum feugiat ultricies hendrerit. Morbi sed varius metus. Nam feugiat maximus
turpis, a sollicitudin ligula porttitor eu.Fusce hendrerit tellus et dignissim sagittis. Nulla consectetur
condimentum tortor, non bibendum erat lacinia eget. Integer vitae maximus tortor. Vestibulum ante ipsum
primis in faucibus orci luctus et ultrices posuere cubilia Curae; Pellentesque suscipit sem ipsum, in
tincidunt risus pellentesque vel. Nullam hendrerit consequat leo, in suscipit lectus euismod non. Cras arcu
lacus, lacinia semper mauris vel, pharetra dignissim velit. Nam lacinia turpis a nibh bibendum, et
placerat tellus accumsan. Sed tincidunt cursus nisi in laoreet. Suspendisse amet.'
);
}
}
<?php
declare(strict_types=1);
namespace Phpml\Tests\Tokenization;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Tokenization\NGramTokenizer;
/**
 * Tests for NGramTokenizer: expected-token checks for ASCII and UTF-8 (CJK)
 * input across several (minGram, maxGram) ranges, plus validation of the
 * constructor's argument guards.
 *
 * Inspiration: https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-ngram-tokenizer.html
 */
class NGramTokenizerTest extends TokenizerTest
{
/**
 * tokenize() must produce exactly the grams listed for the given
 * (minGram, maxGram) configuration, in word order then length order.
 *
 * @dataProvider textDataProvider
 */
public function testNGramTokenization(int $minGram, int $maxGram, string $text, array $tokens): void
{
$tokenizer = new NGramTokenizer($minGram, $maxGram);
self::assertEquals($tokens, $tokenizer->tokenize($text));
}
// Constructor guard: minGram must not exceed maxGram.
public function testMinGramGreaterThanMaxGramNotAllowed(): void
{
self::expectException(InvalidArgumentException::class);
new NGramTokenizer(5, 2);
}
// Constructor guard: minGram must be at least 1.
public function testMinGramValueTooSmall(): void
{
self::expectException(InvalidArgumentException::class);
new NGramTokenizer(0, 2);
}
// Constructor guard: maxGram must be at least 1.
public function testMaxGramValueTooSmall(): void
{
self::expectException(InvalidArgumentException::class);
new NGramTokenizer(1, 0);
}
/**
 * Each dataset: [minGram, maxGram, input text, expected tokens].
 * Note that words shorter than two characters are dropped before
 * gram generation, and grams are emitted per word, shortest first.
 */
public function textDataProvider(): array
{
return [
// Unigrams + bigrams over two short ASCII words.
[
1, 2,
'Quick Fox',
['Q', 'u', 'i', 'c', 'k', 'Qu', 'ui', 'ic', 'ck', 'F', 'o', 'x', 'Fo', 'ox'],
],
// Trigrams only.
[
3, 3,
'Quick Foxes',
['Qui', 'uic', 'ick', 'Fox', 'oxe', 'xes'],
],
// Multibyte (CJK) input, unigrams + bigrams.
[
1, 2,
'快狐跑过 边缘跑',
['快', '狐', '跑', '过', '快狐', '狐跑', '跑过', '边', '缘', '跑', '边缘', '缘跑'],
],
// Multibyte (CJK) input, trigrams only.
[
3, 3,
'快狐跑过狐 边缘跑狐狐',
['快狐跑', '狐跑过', '跑过狐', '边缘跑', '缘跑狐', '跑狐狐'],
],
// Larger ASCII fixture from TokenizerTest, grams of length 2-4.
[
2, 4,
$this->getSimpleText(),
[
'Lo', 'or', 're', 'em', 'Lor', 'ore', 'rem', 'Lore', 'orem', 'ip', 'ps', 'su', 'um', 'ips', 'psu', 'sum', 'ipsu',
'psum', 'do', 'ol', 'lo', 'or', 'dol', 'olo', 'lor', 'dolo', 'olor', 'si', 'it', 'sit', 'am', 'me', 'et', 'ame',
'met', 'amet', 'co', 'on', 'ns', 'se', 'ec', 'ct', 'te', 'et', 'tu', 'ur', 'con', 'ons', 'nse', 'sec', 'ect', 'cte',
'tet', 'etu', 'tur', 'cons', 'onse', 'nsec', 'sect', 'ecte', 'ctet', 'tetu', 'etur', 'ad', 'di', 'ip', 'pi', 'is',
'sc', 'ci', 'in', 'ng', 'adi', 'dip', 'ipi', 'pis', 'isc', 'sci', 'cin', 'ing', 'adip', 'dipi', 'ipis', 'pisc',
'isci', 'scin', 'cing', 'el', 'li', 'it', 'eli', 'lit', 'elit', 'Cr', 'ra', 'as', 'Cra', 'ras', 'Cras', 'co', 'on',
'ns', 'se', 'ec', 'ct', 'te', 'et', 'tu', 'ur', 'con', 'ons', 'nse', 'sec', 'ect', 'cte', 'tet', 'etu', 'tur',
'cons', 'onse', 'nsec', 'sect', 'ecte', 'ctet', 'tetu', 'etur', 'du', 'ui', 'dui', 'et', 'lo', 'ob', 'bo', 'or',
'rt', 'ti', 'is', 'lob', 'obo', 'bor', 'ort', 'rti', 'tis', 'lobo', 'obor', 'bort', 'orti', 'rtis', 'au', 'uc',
'ct', 'to', 'or', 'auc', 'uct', 'cto', 'tor', 'auct', 'ucto', 'ctor', 'Nu', 'ul', 'll', 'la', 'Nul', 'ull', 'lla',
'Null', 'ulla', 'vi', 'it', 'ta', 'ae', 'vit', 'ita', 'tae', 'vita', 'itae', 'co', 'on', 'ng', 'gu', 'ue', 'con',
'ong', 'ngu', 'gue', 'cong', 'ongu', 'ngue', 'lo', 'or', 're', 'em', 'lor', 'ore', 'rem', 'lore', 'orem',
],
],
// Larger UTF-8 fixture from TokenizerTest, grams of length 2-4
// (single-character words contribute nothing).
[
2, 4,
$this->getUtf8Text(),
[
'鋍鞎', '鞮鞢', '鞢騉', '鞮鞢騉', '袟袘', '袘觕', '袟袘觕', '炟砏', '謺貙', '貙蹖', '謺貙蹖', '偢偣', '偣唲',
'偢偣唲', '箷箯', '箯緷', '箷箯緷', '鑴鱱', '鱱爧', '鑴鱱爧', '覮轀', '剆坲', '煘煓', '煓瑐', '煘煓瑐', '鬐鶤',
'鶤鶐', '鬐鶤鶐', '飹勫', '勫嫢', '飹勫嫢', '枲柊', '柊氠', '枲柊氠', '鍎鞚', '鞚韕', '鍎鞚韕', '焲犈', '殍涾',
'涾烰', '殍涾烰', '齞齝', '齝囃', '齞齝囃', '蹅輶', '孻憵', '擙樲', '樲橚', '擙樲橚', '藒襓', '襓謥', '藒襓謥',
'岯岪', '岪弨', '岯岪弨', '廞徲', '孻憵', '憵懥', '孻憵懥', '趡趛', '趛踠', '趡趛踠',
],
],
];
}
}
<?php
declare(strict_types=1);
namespace Phpml\Tests\Tokenization;
use PHPUnit\Framework\TestCase;
/**
 * Base class for tokenizer tests providing shared text fixtures so every
 * tokenizer test exercises the same inputs.
 */
abstract class TokenizerTest extends TestCase
{
/**
 * ASCII fixture. Deliberately embeds punctuation joined to words
 * ('ipsum-dolor', 'consectetur/adipiscing', 'lobortis;auctor.', ',.,/')
 * so subclasses can verify how each tokenizer treats non-word characters.
 */
public function getSimpleText(): string
{
return 'Lorem ipsum-dolor sit amet, consectetur/adipiscing elit.
Cras consectetur, dui et lobortis;auctor.
Nulla vitae ,.,/ congue lorem.';
}
/**
 * UTF-8 (CJK) fixture. Includes single-character words and words with
 * trailing commas to exercise multibyte and punctuation handling.
 */
public function getUtf8Text(): string
{
return '鋍鞎 鳼 鞮鞢騉 袟袘觕, 炟砏 蒮 謺貙蹖 偢偣唲 蒛 箷箯緷 鑴鱱爧 覮轀,
剆坲 煘煓瑐 鬐鶤鶐 飹勫嫢 銪 餀 枲柊氠 鍎鞚韕 焲犈,
殍涾烰 齞齝囃 蹅輶 鄜, 孻憵 擙樲橚 藒襓謥 岯岪弨 蒮 廞徲 孻憵懥 趡趛踠 槏';
}
}
......@@ -5,37 +5,28 @@ declare(strict_types=1);
namespace Phpml\Tests\Tokenization;
use Phpml\Tokenization\WhitespaceTokenizer;
use PHPUnit\Framework\TestCase;
// NOTE(review): this region was a garbled diff (old and new lines interleaved:
// two class declarations, two $tokens assignments, an assert on an undefined
// $text). Reconstructed below is the valid post-commit version, which extends
// TokenizerTest and uses its shared fixtures.
class WhitespaceTokenizerTest extends TokenizerTest
{
    /**
     * Whitespace tokenization splits on whitespace only, so punctuation
     * stays attached to its word ('ipsum-dolor', 'amet,', ',.,/', ...).
     */
    public function testTokenizationOnAscii(): void
    {
        $tokenizer = new WhitespaceTokenizer();

        $tokens = ['Lorem', 'ipsum-dolor', 'sit', 'amet,', 'consectetur/adipiscing', 'elit.',
            'Cras', 'consectetur,', 'dui', 'et', 'lobortis;auctor.',
            'Nulla', 'vitae', ',.,/', 'congue', 'lorem.', ];

        self::assertEquals($tokens, $tokenizer->tokenize($this->getSimpleText()));
    }

    /**
     * Multibyte input: single-character words are kept and trailing commas
     * remain attached, since only whitespace separates tokens.
     */
    public function testTokenizationOnUtf8(): void
    {
        $tokenizer = new WhitespaceTokenizer();

        $tokens = ['鋍鞎', '鳼', '鞮鞢騉', '袟袘觕,', '炟砏', '蒮', '謺貙蹖', '偢偣唲', '蒛', '箷箯緷', '鑴鱱爧', '覮轀,',
            '剆坲', '煘煓瑐', '鬐鶤鶐', '飹勫嫢', '銪', '餀', '枲柊氠', '鍎鞚韕', '焲犈,',
            '殍涾烰', '齞齝囃', '蹅輶', '鄜,', '孻憵', '擙樲橚', '藒襓謥', '岯岪弨', '蒮', '廞徲', '孻憵懥', '趡趛踠', '槏', ];

        self::assertEquals($tokens, $tokenizer->tokenize($this->getUtf8Text()));
    }
}
......@@ -5,37 +5,28 @@ declare(strict_types=1);
namespace Phpml\Tests\Tokenization;
use Phpml\Tokenization\WordTokenizer;
use PHPUnit\Framework\TestCase;
// NOTE(review): this region was a garbled diff (old and new lines interleaved:
// two class declarations, an assert on an undefined $text). Reconstructed
// below is the valid post-commit version, which extends TokenizerTest and
// uses its shared fixtures.
class WordTokenizerTest extends TokenizerTest
{
    /**
     * Word tokenization keeps runs of 2+ alphanumeric characters and treats
     * all punctuation as separators, so 'ipsum-dolor' splits in two and
     * ',.,/' disappears entirely.
     */
    public function testTokenizationOnAscii(): void
    {
        $tokenizer = new WordTokenizer();

        $tokens = ['Lorem', 'ipsum', 'dolor', 'sit', 'amet', 'consectetur', 'adipiscing', 'elit',
            'Cras', 'consectetur', 'dui', 'et', 'lobortis', 'auctor',
            'Nulla', 'vitae', 'congue', 'lorem', ];

        self::assertEquals($tokens, $tokenizer->tokenize($this->getSimpleText()));
    }

    /**
     * Multibyte input: single-character words (e.g. 鳼, 蒮) are dropped
     * because only tokens of 2+ characters are kept; commas are stripped.
     */
    public function testTokenizationOnUtf8(): void
    {
        $tokenizer = new WordTokenizer();

        $tokens = ['鋍鞎', '鞮鞢騉', '袟袘觕', '炟砏', '謺貙蹖', '偢偣唲', '箷箯緷', '鑴鱱爧', '覮轀',
            '剆坲', '煘煓瑐', '鬐鶤鶐', '飹勫嫢', '枲柊氠', '鍎鞚韕', '焲犈',
            '殍涾烰', '齞齝囃', '蹅輶', '孻憵', '擙樲橚', '藒襓謥', '岯岪弨', '廞徲', '孻憵懥', '趡趛踠', ];

        self::assertEquals($tokens, $tokenizer->tokenize($this->getUtf8Text()));
    }
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment