From d34e1151afbd9d69b26212d60a4b7439b2435b91 Mon Sep 17 00:00:00 2001 From: itismadness Date: Sun, 24 May 2020 06:01:58 +0100 Subject: [PATCH] improve handling of unknown non-UTF8 EAC logs --- src/Parser/EAC/Translator.php | 1 + src/Util.php | 6 ++ tests/Parser/EAC/TranslatorTest.php | 30 ++++++ tests/logs/eac/originals/en_7.log | Bin 0 -> 24662 bytes tests/logs/eac/originals/en_8.log | 194 ++++++++++++++++++++++++++++++++++++ 5 files changed, 231 insertions(+) create mode 100644 tests/logs/eac/originals/en_7.log create mode 100644 tests/logs/eac/originals/en_8.log diff --git a/src/Parser/EAC/Translator.php b/src/Parser/EAC/Translator.php index e78d908..45f2dd2 100644 --- a/src/Parser/EAC/Translator.php +++ b/src/Parser/EAC/Translator.php @@ -54,6 +54,7 @@ class Translator file_get_contents(__DIR__ . DIRECTORY_SEPARATOR . 'languages' . DIRECTORY_SEPARATOR . 'master.json'), true ); + foreach ($languages as $code => $lang) { foreach ($lang['eac_strings'] as $eac_string) { if (preg_match('/' . preg_quote($eac_string, "/") . '/ui', $log) === 1) { diff --git a/src/Util.php b/src/Util.php index 5b36730..ccc5a84 100644 --- a/src/Util.php +++ b/src/Util.php @@ -41,6 +41,12 @@ class Util if ($results['charset'] !== 'utf-8' && $results['confidence'] > 0.7) { // $log = mb_convert_encoding($log, 'UTF-8', $results['charset']); $log = iconv($results['charset'], 'UTF-8', $log); + } elseif ($results['charset'] !== 'utf-8' && $results['confidence'] > 0.3) { + // If we've got a poor confidence on our decoding, we just use a generic + // ISO-8859-1 as that covers a decent range of things that people would + // inadvertently re-encode a log into. I seriously cannot express how + // much I hate how EAC does not use always UTF-8. + $log = iconv('ISO-8859-1', 'UTF-8', $log); } } return $log; diff --git a/tests/Parser/EAC/TranslatorTest.php b/tests/Parser/EAC/TranslatorTest.php index a1cea0e..e587288 100644 --- a/tests/Parser/EAC/TranslatorTest.php +++ b/tests/Parser/EAC/TranslatorTest.php @@ -6,6 +6,7 @@ namespace OrpheusNET\Logchecker\Parser\EAC; use FilesystemIterator; use OrpheusNET\Logchecker\Exception\UnknownLanguageException; +use OrpheusNET\Logchecker\Util; use PHPUnit\Framework\TestCase; class TranslatorTest extends TestCase @@ -51,4 +52,33 @@ class TranslatorTest extends TestCase ) ); } + + public function englishLogProvider() + { + return array_map( + function ($file) { + return [$file]; + }, + array_filter( + scandir(implode(DIRECTORY_SEPARATOR, [__DIR__, '..', '..', 'logs', 'eac', 'originals'])), + function ($file) { + return substr($file, 0, 2) === 'en'; + } + ) + ); + } + + /** + * @dataProvider englishLogProvider + */ + public function testEnglishLanguage(string $file) + { + $logPath = implode(DIRECTORY_SEPARATOR, [__DIR__, '..', '..', 'logs', 'eac', 'originals', $file]); + $log = file_get_contents($logPath); + $log = Util::decodeEncoding($log, $logPath); + $langDetails = Translator::getLanguage($log); + $this->assertSame('en', $langDetails['code']); + $this->assertSame('English', $langDetails['name']); + $this->assertSame('English', $langDetails['name_english']); + } } diff --git a/tests/logs/eac/originals/en_7.log b/tests/logs/eac/originals/en_7.log new file mode 100644 index 0000000000000000000000000000000000000000..6e51a1bdf912da9d9912284422a55ef516abef12 GIT binary patch literal 24662 zcmeI4U2hx56^8e^K>q^~K#c-a3Mq;DD3A+^5)-$M<;t?tAnAoH$%w74j) zbl=zayW)fHUKE{TwP@?^WpSo-$GThB=T&j7G?%(tD|VE^`Oj6xO)70oc{;j&U;Hk8 z_Emmgsm?_4R(VYoYH#)QjegHn=7sLg^uJtsJC*)tmHZ}g#V&n8mP5Aw>-cWZm zNb2LHU&{Da|87$5J>9)Z@4(El$_>mQ>679Iy(@Dvocn8~d7V;#6H;EOH1M{j?|1rl zAzi@5wccBzPxLgjo9j=$SE&s6I8mDG#4}h9RF(Odeh-%$#de}|ko4)&ENk-wLLQ4V zo~;y*5^j(64c7AW14#sq#xH0@T?XRujjonbzti`n{@>_=(z*T**X~9%FO`nAfmD7{ zx?4#aO}WLql?0IHu(Z(?d8}3L;5}#r?{1Bh6K`^<)a0e!T&eumi8l2r*W`(6;F^qb zX$>PC%0!HoQ z7hhM>3Gxg(`})59WI2-NR(Xt8*XlCqW0Yq3KlEM3weD;Skc0Wy*BvbpzOEBZY!A4j zRXWW}@n^n_%5v%B_t5J|rr67t5V)Iif3Dgg<-dx1utbUURkL1{{*`i{OHY*76X)O+ zySuJmdOKRhkzFOM%#Z$efccjmxzqReK zC0JH(x?OClZ~9)(AFBp^-Mx}bXdA1d@UuR-Zb-`35xE5~lenb+kF;P3epW<_?-%JC z+79~PN8)imrK9ZIgpJ`;J*C(c#=gEQ?Tzfu zkE-{_((j3bqq{OcsIJf-F7ZelBA35NHYk@QmLLOow+?%$)q_O zDf?+EYoK2LP$fsX$ktqE3u1kb%A~KHUrWOWl3(Nwd|U||v?JGMDcO=6mHa`^E~MQX zy;;#^l&(nY_Q<$%ukeLr`K>s6q}o4IjXzP2a{fpMg~{b-?Pa};YdmWEaoz7Fg!IQW zfM@kK=@;)Z>Y5RAAl*Va@>7c?)%YsanqK5I$zd30kt=K7`AjKWMI>0xX>FiKtU8pe zi_4__(A`pMv-nf=K1&JV*xx1bLaQVlEiOJs(r#bx?di~OV+Dr&VB5yN>KVDRMK}^A zTJvk=`lmiz&rLXTVx4ZXGBfo!1?aO(%XZ+BXRik&|nnWGXbUGcVugeOr zsvJ`1@-qdv@|EAxt8FXC=G~h~)ai$!#BQSF@mQu~>PVzBMm;EO-BE~=C=-sUJ2stb z5_QYP9B(JdL%oya<)@t_`E5Nd>%%1mg`GPJW$GI$V@Etpqi&g*&Q7AkNV80xy07WU zs_K#@V(LL*67{Z7($zRDv@)sg)(X^-YtUg-8}(r9$vaSogw3g`2ZhjW`R!Jcm9_OT z$y;NlZoRK3I>^g(;He||P11W%FnXYodD|9s_??D%YsJ*JM5ilCtNI>paiC{a^&gdd zPzdP`Q{R!^ccxQ%96p(8WqzdpKTo)BCdl~TTe2{ z)VqnN(e|DxOp-kKB~In~O|o^iD&}}yG_mMgNyfu1&ZsNv!*ve|A>DG_v9n-4IGZH- zZ0qR3sQYMtVjDysx2dvn`JgT+ME&yHj%tCGAO)OGu6wq1=n1<5y0PC;5)z>-FrOvj zx(9`*U*v%^C~xXJmZiOK`g(9zHI4w^Y!L#sl+Z8B=$oVYHXY6vCo}yog3{b;WUTHZFN0 z7I0?jSXx>z{YkbfPzMv(`HTi~WeVeV9?MDp8`g9Z>fk=8cU2me7OPf9o$C(f$EgQ} zan#W|H&eu!t;2If#%Q=~S6p}7d8B9?h%5S#>kpncVmM3U>A@)79IIVNv=CjHb(5#(a#B zh`{8?ioL&;^oCKm=M0FSUuk5`+z59!#lKabpY;Fz1S-UH&NQkz*ZX(ML42${Zi^8P zS_z9c&g)a1)&GcPyd;Y=6fK&XL?zm4Q9OD<+jBIY)Y z;*rW_Je*^{9pM81XG`3)6^R+}HIrTdi;ZZl|aeMifJ6`_r2bFRw-*K%iur}tI&H{1L!H9=X?*rE*1~$K1X zi&y-qJ)sSMxm^pdyD9-spD60S>OLR5T5iL#dk9|XN5SindIY=zM}6rTTC%g?_To3( zCi#r|2)n{-SM7XVsO{)?SoY&~wkmzIxZO_uJE&vazFWi2vi4zLsHfI#ea{EC-I_iE zeJHqPy^U9Dyp)&p^TJ}4k#Tc27Qc`&Gm!)APZd_H(l?7$VqCN-M%bIFM`sPyEdK1S zax<>qQf>QcoAbe{Y9ERl{ zB>Yp`UmHo-u8CfHReQH*X&o)=O@2cv8X1py?MUP3fA7xv4EJ;6cD5>gvq)w1v?mL> zFFVP+P93B&Co_;=LW*n__I~% zo5dV#W#Z z0EpQ&=f65$#JHQeKHG=s9su#BkEd@MtKH;js*Ph{iI%wUJ7g2UhFNgL$0_ zSbZ$4dgZ*=!#~x3)+MsqktbIRYiXN}&hE-;VS6^z+Y<2{wie@FK~?%@(Mw-IEO}F0 z6T_I@(zX-o=`YswnNQ>Zn_th;=9w_#G|we4v#_YXL9_E;j!(8_c^vPmLTpv~W)aIu zfSrUk@K$I33p)+o`NkWbk8ecJQ+eO7O zv61Hd26|w8MrK9v2C8semA+XV6RBd{zpihPIJ@O#34p?^Ok z#_A3Bf-NdqoAwh1(mZ`Hk$Zb<)4W{z=V|GiMJth6`{T5*IwLY7dZ2{aKVoI`vAXqU z!&rS}s~c!#Ug1)<*Q>o26|v3Ei7`90C+(v_R{vJn*Q)f*B9<9fk0et3qy7 z`eu>4BMuo&!7sDWvwH?uS}xBifc<$y6za{0v1-`J{Zgo9_Lg1Jiyc8A_S~EwOF!>5 zC-nE#cD5>g(`d!IG858IsGS|HtOO@s;uT`xeI977Hv>i_E~Ax**SWCD_-k1sFptn! zm2jS(aq7wWT?*DCe72CjZu9mRe6Si}J&jiSmG-wPeY1$}h+B4IdHosVs##aluo{7R z2Ugf}Hy=dTn<-*);C~BGSM)U|^UPHCY1!E|5H^POFg{OFmA+{V z_mfSwS6gSt4s#q`c_<#0&L`^2N{D(GW(6P^{wA$jxY3H-5BlemuYAVKqQ)R*jq%Qc zS@tm!GsPOQ*Q^S&Rq30>tY_93|1e|GtKmnGk5$Nw1F+xo@FSRIu7}x}TjM+$uf&ar znfE??sOXFSEX+j!K_K}4hGhS69Jpm6UN`5 z=U`@WUUO}DQ}qxWb8U>^d z-z;jGQDcWAeFf;99kq;8Hmw3Kt{!S*#~D7EccgnyG^cYe<|-WQY}2!A ztQ@1~WIs!<(#|s0pO(H^#Io~Ypz#p>0y`yVM=a-&5Qnq9ozGk_r%}{HY|O6pGzRk- zd!K4thQ(#KBu^I6yKNq|#C`k10{b7Zp;dOaDt)u4g*v--SOLbUYIfAJa|geGIYnAP zUH=w2v9`8+GA?*yu7OjGIRj(1QzR(y+mxF#oR8D!j5B||A~HM%PO2vIz!?*_qLI&j zCGx-tvvGE3V4u?>iMkP|_)PCfm)~(t10FQ9(Zn>O^|@|PV(*Yo(2m_jKF|8}K5d-+ z=u;_K|3;ZUHyBPii<@ ORu$npweGou