Commit cefb4fc7 authored by KenorFR's avatar KenorFR Committed by Arkadiusz Kondas
Browse files

Ngram word (#370)

* Add NGramWordTokenizer

* Update doc
Add test
Check coding standards
parent dbbce0e0
......@@ -71,3 +71,20 @@ $tokenizer->tokenize('Quick Fox');
// returns ['Q', 'u', 'i', 'c', 'k', 'Qu', 'ui', 'ic', 'ck', 'F', 'o', 'x', 'Fo', 'ox']
```
**NGramWordTokenizer**
The NGramWordTokenizer tokenizer accepts the following parameters:
`$minGram` - minimum length of characters in a gram. Defaults to 1.
`$maxGram` - maximum length of characters in a gram. Defaults to 2.
```php
use Phpml\Tokenization\NGramWordTokenizer;
$tokenizer = new NGramWordTokenizer(1, 2);
$tokenizer->tokenize('very quick fox');
// returns ['very', 'quick', 'fox', 'very quick', 'quick fox']
```
<?php
declare(strict_types=1);
namespace Phpml\Tokenization;
use Phpml\Exception\InvalidArgumentException;
class NGramWordTokenizer extends WordTokenizer
{
/**
* @var int
*/
private $minGram;
/**
* @var int
*/
private $maxGram;
public function __construct(int $minGram = 1, int $maxGram = 2)
{
if ($minGram < 1 || $maxGram < 1 || $minGram > $maxGram) {
throw new InvalidArgumentException(sprintf('Invalid (%s, %s) minGram and maxGram value combination', $minGram, $maxGram));
}
$this->minGram = $minGram;
$this->maxGram = $maxGram;
}
/**
* {@inheritdoc}
*/
public function tokenize(string $text): array
{
preg_match_all('/\w\w+/u', $text, $words);
$words = $words[0];
$nGrams = [];
for ($j = $this->minGram; $j <= $this->maxGram; $j++) {
$nGrams = array_merge($nGrams, $this->getNgrams($words, $j));
}
return $nGrams;
}
private function getNgrams(array $match, int $n = 2): array
{
$ngrams = [];
$len = count($match);
for ($i = 0; $i < $len; $i++) {
if ($i > ($n - 2)) {
$ng = '';
for ($j = $n - 1; $j >= 0; $j--) {
$ng .= ' '.$match[$i - $j];
}
$ngrams[] = trim($ng);
}
}
return $ngrams;
}
}
<?php
declare(strict_types=1);
namespace Phpml\Tests\Tokenization;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Tokenization\NGramWordTokenizer;
/**
* Inspiration: https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-ngram-tokenizer.html
*/
class NGramWordTokenizerTest extends TokenizerTest
{
/**
* @dataProvider textDataProvider
*/
public function testNGramTokenization(int $minGram, int $maxGram, string $text, array $tokens): void
{
$tokenizer = new NGramWordTokenizer($minGram, $maxGram);
self::assertEquals($tokens, $tokenizer->tokenize($text));
}
public function testMinGramGreaterThanMaxGramNotAllowed(): void
{
self::expectException(InvalidArgumentException::class);
new NGramWordTokenizer(5, 2);
}
public function testMinGramValueTooSmall(): void
{
self::expectException(InvalidArgumentException::class);
new NGramWordTokenizer(0, 2);
}
public function testMaxGramValueTooSmall(): void
{
self::expectException(InvalidArgumentException::class);
new NGramWordTokenizer(1, 0);
}
public function textDataProvider(): array
{
return [
[
1, 1,
'one two three four',
['one', 'two', 'three', 'four'],
],
[
1, 2,
'one two three four',
['one', 'two', 'three', 'four', 'one two', 'two three', 'three four'],
],
[
1, 3,
'one two three four',
['one', 'two', 'three', 'four', 'one two', 'two three', 'three four', 'one two three', 'two three four'],
],
[
2, 3,
'one two three four',
['one two', 'two three', 'three four', 'one two three', 'two three four'],
],
[
1, 2,
'快狐跑过 边缘跑',
['快狐跑过', '边缘跑', '快狐跑过 边缘跑'],
],
[
2, 4,
$this->getSimpleText(),
[
'Lorem ipsum', 'ipsum dolor', 'dolor sit', 'sit amet', 'amet consectetur', 'consectetur adipiscing',
'adipiscing elit', 'elit Cras', 'Cras consectetur', 'consectetur dui', 'dui et', 'et lobortis',
'lobortis auctor', 'auctor Nulla', 'Nulla vitae', 'vitae congue', 'congue lorem', 'Lorem ipsum dolor',
'ipsum dolor sit', 'dolor sit amet', 'sit amet consectetur', 'amet consectetur adipiscing',
'consectetur adipiscing elit', 'adipiscing elit Cras', 'elit Cras consectetur', 'Cras consectetur dui',
'consectetur dui et', 'dui et lobortis', 'et lobortis auctor', 'lobortis auctor Nulla', 'auctor Nulla vitae',
'Nulla vitae congue', 'vitae congue lorem', 'Lorem ipsum dolor sit', 'ipsum dolor sit amet',
'dolor sit amet consectetur', 'sit amet consectetur adipiscing', 'amet consectetur adipiscing elit',
'consectetur adipiscing elit Cras', 'adipiscing elit Cras consectetur', 'elit Cras consectetur dui',
'Cras consectetur dui et', 'consectetur dui et lobortis', 'dui et lobortis auctor', 'et lobortis auctor Nulla',
'lobortis auctor Nulla vitae', 'auctor Nulla vitae congue', 'Nulla vitae congue lorem',
],
],
[
2, 4,
$this->getUtf8Text(),
[
'鋍鞎 鞮鞢騉', '鞮鞢騉 袟袘觕', '袟袘觕 炟砏', '炟砏 謺貙蹖', '謺貙蹖 偢偣唲', '偢偣唲 箷箯緷', '箷箯緷 鑴鱱爧', '鑴鱱爧 覮轀',
'覮轀 剆坲', '剆坲 煘煓瑐', '煘煓瑐 鬐鶤鶐', '鬐鶤鶐 飹勫嫢', '飹勫嫢 枲柊氠', '枲柊氠 鍎鞚韕', '鍎鞚韕 焲犈', '焲犈 殍涾烰',
'殍涾烰 齞齝囃', '齞齝囃 蹅輶', '蹅輶 孻憵', '孻憵 擙樲橚', '擙樲橚 藒襓謥', '藒襓謥 岯岪弨', '岯岪弨 廞徲', '廞徲 孻憵懥',
'孻憵懥 趡趛踠', '鋍鞎 鞮鞢騉 袟袘觕', '鞮鞢騉 袟袘觕 炟砏', '袟袘觕 炟砏 謺貙蹖', '炟砏 謺貙蹖 偢偣唲', '謺貙蹖 偢偣唲 箷箯緷',
'偢偣唲 箷箯緷 鑴鱱爧', '箷箯緷 鑴鱱爧 覮轀', '鑴鱱爧 覮轀 剆坲', '覮轀 剆坲 煘煓瑐', '剆坲 煘煓瑐 鬐鶤鶐', '煘煓瑐 鬐鶤鶐 飹勫嫢',
'鬐鶤鶐 飹勫嫢 枲柊氠', '飹勫嫢 枲柊氠 鍎鞚韕', '枲柊氠 鍎鞚韕 焲犈', '鍎鞚韕 焲犈 殍涾烰', '焲犈 殍涾烰 齞齝囃', '殍涾烰 齞齝囃 蹅輶',
'齞齝囃 蹅輶 孻憵', '蹅輶 孻憵 擙樲橚', '孻憵 擙樲橚 藒襓謥', '擙樲橚 藒襓謥 岯岪弨', '藒襓謥 岯岪弨 廞徲', '岯岪弨 廞徲 孻憵懥',
'廞徲 孻憵懥 趡趛踠', '鋍鞎 鞮鞢騉 袟袘觕 炟砏', '鞮鞢騉 袟袘觕 炟砏 謺貙蹖', '袟袘觕 炟砏 謺貙蹖 偢偣唲', '炟砏 謺貙蹖 偢偣唲 箷箯緷',
'謺貙蹖 偢偣唲 箷箯緷 鑴鱱爧', '偢偣唲 箷箯緷 鑴鱱爧 覮轀', '箷箯緷 鑴鱱爧 覮轀 剆坲', '鑴鱱爧 覮轀 剆坲 煘煓瑐',
'覮轀 剆坲 煘煓瑐 鬐鶤鶐', '剆坲 煘煓瑐 鬐鶤鶐 飹勫嫢', '煘煓瑐 鬐鶤鶐 飹勫嫢 枲柊氠', '鬐鶤鶐 飹勫嫢 枲柊氠 鍎鞚韕',
'飹勫嫢 枲柊氠 鍎鞚韕 焲犈', '枲柊氠 鍎鞚韕 焲犈 殍涾烰', '鍎鞚韕 焲犈 殍涾烰 齞齝囃', '焲犈 殍涾烰 齞齝囃 蹅輶',
'殍涾烰 齞齝囃 蹅輶 孻憵', '齞齝囃 蹅輶 孻憵 擙樲橚', '蹅輶 孻憵 擙樲橚 藒襓謥', '孻憵 擙樲橚 藒襓謥 岯岪弨', '擙樲橚 藒襓謥 岯岪弨 廞徲',
'藒襓謥 岯岪弨 廞徲 孻憵懥', '岯岪弨 廞徲 孻憵懥 趡趛踠',
],
],
];
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment