Additional chars that belong to words and do not start a new word.

* * @psalm-pure * * @return int|string[] *

The number of words in the string.

*
     * @phpstan-param 0|1|2 $format
     * @phpstan-return ($format is 2 ? array<int, string> : ($format is 1 ? list<string> : 0|positive-int))
     */
    public static function str_word_count(string $str, int $format = 0, string $char_list = '')
    {
        // The code below treats the odd indexes of the split result as the
        // words and the even indexes as the text in between them.
        $chunks = self::str_to_words($str, $char_list);
        $chunk_count = \count($chunks);

        // Format 1: plain list of the words.
        if ($format === 1) {
            $words = [];
            $index = 1;
            while ($index < $chunk_count) {
                $words[] = $chunks[$index];
                $index += 2;
            }

            return $words;
        }

        // Format 2: words keyed by their character position in the input.
        if ($format === 2) {
            $words_by_position = [];
            $position = (int) self::strlen($chunks[0]);
            $index = 1;
            while ($index < $chunk_count) {
                $word = $chunks[$index];
                $words_by_position[$position] = $word;

                // Advance past the word itself and the text that follows it.
                $position += (int) self::strlen($word) + (int) self::strlen($chunks[$index + 1]);
                $index += 2;
            }

            return $words_by_position;
        }

        // Any other format: only the number of words.
        $word_count = (int) (($chunk_count - 1) / 2);

        /* @phpstan-ignore-next-line | it should be 0|positive-int, maybe nested "phpstan-return" is not working? */
        return $word_count;
    }

    /**
     * Case-insensitive string comparison.
     *
     * INFO: Case-insensitive version of UTF8::strcmp()
     *
     * EXAMPLE: UTF8::strcasecmp("iñtërnâtiôn\nàlizætiøn", "Iñtërnâtiôn\nàlizætiøn"); // 0
     *
     * @param string $str1

The first string.

* @param string $str2

The second string.

* @param string $encoding [optional]

Set the charset for e.g. "mb_" function

* * @psalm-pure * * @return int * < 0 if str1 is less than str2;
* > 0 if str1 is greater than str2,
* 0 if they are equal
     */
    public static function strcasecmp(
        string $str1,
        string $str2,
        string $encoding = 'UTF-8'
    ): int {
        // Case-fold both operands with identical settings, then delegate to
        // the case-sensitive comparison.
        $folded1 = self::strtocasefold($str1, true, false, $encoding, null, false);
        $folded2 = self::strtocasefold($str2, true, false, $encoding, null, false);

        return self::strcmp($folded1, $folded2);
    }

    /**
     * Case-sensitive string comparison.
     *
     * EXAMPLE: UTF8::strcmp("iñtërnâtiôn\nàlizætiøn", "iñtërnâtiôn\nàlizætiøn"); // 0
     *
     * @param string $str1

The first string.

* @param string $str2

The second string.

* * @psalm-pure * * @return int * < 0 if str1 is less than str2
* > 0 if str1 is greater than str2
* 0 if they are equal
     */
    public static function strcmp(string $str1, string $str2): int
    {
        // Identical byte sequences need no normalization at all.
        if ($str1 === $str2) {
            return 0;
        }

        // Compare the canonically decomposed (NFD) forms, so that composed and
        // decomposed spellings of the same text compare as equal.
        //
        // Normalizer::normalize() returns false for input it cannot normalize
        // (e.g. invalid UTF-8). Fall back to the raw string in that case:
        // otherwise two different broken strings would both be compared as ""
        // (and reported as equal), or a non-string would reach \strcmp().
        $normalized1 = \Normalizer::normalize($str1, \Normalizer::NFD);
        if ($normalized1 === false) {
            $normalized1 = $str1;
        }

        $normalized2 = \Normalizer::normalize($str2, \Normalizer::NFD);
        if ($normalized2 === false) {
            $normalized2 = $str2;
        }

        return \strcmp($normalized1, $normalized2);
    }

    /**
     * Find length of initial segment not matching mask.
     *
     * @param string   $str
     * @param string   $char_list
     * @param int      $offset
     * @param int|null $length
     * @param string   $encoding [optional]

Set the charset for e.g. "mb_" function

*
     * @psalm-pure
     *
     * @return int
     *
     * @phpstan-return 0|positive-int
     */
    public static function strcspn(
        string $str,
        string $char_list,
        int $offset = 0,
        int $length = null,
        string $encoding = 'UTF-8'
    ): int {
        if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
            $encoding = self::normalize_encoding($encoding, 'UTF-8');
        }

        // Without a mask nothing can match, so the whole string counts.
        if ($char_list === '') {
            return (int) self::strlen($str, $encoding);
        }

        // Restrict the subject to the requested window first.
        $needs_slice = $offset !== 0 || $length !== null;
        if ($needs_slice) {
            if ($encoding !== 'UTF-8') {
                $slice = self::substr($str, $offset, $length, $encoding);
            } elseif ($length === null) {
                $slice = \mb_substr($str, $offset);
            } else {
                $slice = \mb_substr($str, $offset, $length);
            }

            if ($slice === false) {
                return 0;
            }

            $str = $slice;
        }

        if ($str === '') {
            return 0;
        }

        // Lazily capture everything in front of the first masked character.
        $matches = [];
        $pattern = '/^(.*?)' . self::rxClass($char_list) . '/us';
        if (!\preg_match($pattern, $str, $matches)) {
            // No masked character at all: the whole (sliced) string counts.
            return (int) self::strlen($str, $encoding);
        }

        $prefix_length = self::strlen($matches[1], $encoding);

        return $prefix_length === false ? 0 : $prefix_length;
    }

    /**
     * Create a UTF-8 string from code points.
     *
     * INFO: opposite to UTF8::codepoints()
     *
     * EXAMPLE: UTF8::string(array(246, 228, 252)); // 'öäü'
     *
     * @param int|int[]|string|string[] $intOrHex

Integer or Hexadecimal codepoints.

* * @phpstan-param int[]|numeric-string[]|int|numeric-string $intOrHex * * @psalm-pure * * @return string *

A UTF-8 encoded string.

*/
    public static function string($intOrHex): string
    {
        if ($intOrHex === []) {
            return '';
        }

        // Accept a single code point as well as a list of them.
        $code_points = \is_array($intOrHex) ? $intOrHex : [$intOrHex];

        // Build one numeric HTML entity per code point ...
        $entities = '';
        foreach ($code_points as $code_point) {
            $entities .= '&#' . (int) $code_point . ';';
        }

        // ... and let mbstring decode them.
        //
        // We cannot use html_entity_decode() here, as it will not return
        // characters for many values < 160.
        return mb_convert_encoding($entities, 'UTF-8', 'HTML-ENTITIES');
    }

    /**
     * Checks if string starts with "BOM" (Byte Order Mark Character) character.
     *
     * EXAMPLE: UTF8::string_has_bom("\xef\xbb\xbf foobar"); // true
     *
     * @param string $str

The input string.

* * @psalm-pure * * @return bool *

* true if the string has BOM at the start,
* false otherwise *

*/
    public static function string_has_bom(string $str): bool
    {
        // self::$BOM is used here as a map: BOM byte sequence => its byte length.
        //
        // Iterate by value on purpose: a by-reference loop variable would turn
        // the entries of the shared static table into references and leave a
        // dangling reference behind, although nothing is ever written here.
        foreach (self::$BOM as $bom_string => $bom_byte_length) {
            // Compare only the leading bytes of the input against this BOM.
            if (\strncmp($str, (string) $bom_string, $bom_byte_length) === 0) {
                return true;
            }
        }

        return false;
    }

    /**
     * Strip HTML and PHP tags from a string + clean invalid UTF-8.
     *
     * EXAMPLE: UTF8::strip_tags("κόσμε\xa0\xa1"); // 'κόσμε'
     *
     * @see http://php.net/manual/en/function.strip-tags.php
     *
     * @param string $str

* The input string. *

* @param string|null $allowable_tags [optional]

* You can use the optional second parameter to specify tags which should * not be stripped. *

* HTML comments and PHP tags are also stripped. This is hardcoded and * can not be changed with allowable_tags. *

* @param bool $clean_utf8 [optional]

Remove non UTF-8 chars from the string.

* * @psalm-pure * * @return string *

The stripped string.

*/
    public static function strip_tags(
        string $str,
        string $allowable_tags = null,
        bool $clean_utf8 = false
    ): string {
        if ($str === '') {
            return '';
        }

        $input = $clean_utf8 ? self::clean($str) : $str;

        // Only hand over the allow-list when the caller actually supplied one.
        return $allowable_tags === null
            ? \strip_tags($input)
            : \strip_tags($input, $allowable_tags);
    }

    /**
     * Strip all whitespace characters. This includes tabs and newline
     * characters, as well as multibyte whitespace such as the thin space
     * and ideographic space.
     *
     * EXAMPLE: UTF8::strip_whitespace(' Ο συγγραφέας '); // 'Οσυγγραφέας'
     *
     * @param string $str
     *
     * @psalm-pure
     *
     * @return string
     */
    public static function strip_whitespace(string $str): string
    {
        // The cast turns a failed replacement (null) into an empty string.
        return $str === '' ? '' : (string) \preg_replace('/[[:space:]]+/u', '', $str);
    }

    /**
     * Find the position of the first occurrence of a substring in a string, case-insensitive.
     *
     * INFO: use UTF8::stripos_in_byte() for the byte-length
     *
     * EXAMPLE: UTF8::stripos('aσσb', 'ΣΣ'); // 1 (σσ == ΣΣ)
     *
     * @see http://php.net/manual/en/function.mb-stripos.php
     *
     * @param string $haystack

The string from which to get the position of the first occurrence of needle.

* @param string $needle

The string to find in haystack.

* @param int $offset [optional]

The position in haystack to start searching.

* @param string $encoding [optional]

Set the charset for e.g. "mb_" function

* @param bool $clean_utf8 [optional]

Remove non UTF-8 chars from the string.

* * @psalm-pure * * @return false|int * Return the (int) numeric position of the first occurrence of needle in the * haystack string,
or false if needle is not found
     *
     * @phpstan-return false|0|positive-int
     */
    public static function stripos(
        string $haystack,
        string $needle,
        int $offset = 0,
        string $encoding = 'UTF-8',
        bool $clean_utf8 = false
    ) {
        $is_php8 = \PHP_VERSION_ID >= 80000;

        // Empty operands: only PHP >= 8 reports an empty needle as a match.
        if ($haystack === '') {
            return ($is_php8 && $needle === '') ? 0 : false;
        }

        if (!$is_php8 && $needle === '') {
            return false;
        }

        if ($clean_utf8) {
            // "mb_strpos()" and "iconv_strpos()" returns wrong position,
            // if invalid characters are found in $haystack before $needle
            $haystack = self::clean($haystack);
            $needle = self::clean($needle);
        }

        //
        // preferred way: mbstring
        //

        if (self::$SUPPORT['mbstring'] === true) {
            if ($encoding !== 'UTF-8') {
                $encoding = self::normalize_encoding($encoding, 'UTF-8');

                return \mb_stripos($haystack, $needle, $offset, $encoding);
            }

            return \mb_stripos($haystack, $needle, $offset);
        }

        $encoding = self::normalize_encoding($encoding, 'UTF-8');

        //
        // fallback via intl
        //

        $can_use_grapheme = $encoding === 'UTF-8' // INFO: "grapheme_stripos()" can't handle other encodings
            && $offset >= 0 // grapheme_stripos() can't handle negative offset
            && self::$SUPPORT['intl'] === true;
        if ($can_use_grapheme) {
            $grapheme_pos = \grapheme_stripos($haystack, $needle, $offset);
            if ($grapheme_pos !== false) {
                return $grapheme_pos;
            }
        }

        //
        // fallback for ascii only
        //

        if (ASCII::is_ascii($haystack . $needle)) {
            return \stripos($haystack, $needle, $offset);
        }

        //
        // fallback via vanilla php
        //

        // Case-fold both sides, then run the case-sensitive search.
        $folded_haystack = self::strtocasefold($haystack, true, false, $encoding, null, false);
        $folded_needle = self::strtocasefold($needle, true, false, $encoding, null, false);

        return self::strpos($folded_haystack, $folded_needle, $offset, $encoding);
    }

    /**
     * Returns all of haystack starting from and including the first occurrence of needle to the end.
     *
     * EXAMPLE:


     * $str = 'iñtërnâtiônàlizætiøn';
     * $search = 'NÂT';
     *
     * UTF8::stristr($str, $search)); // 'nâtiônàlizætiøn'
     * UTF8::stristr($str, $search, true)); // 'iñtër'
     *

* * @param string $haystack

The input string. Must be valid UTF-8.

* @param string $needle

The string to look for. Must be valid UTF-8.

* @param bool $before_needle [optional]

* If TRUE, it returns the part of the * haystack before the first occurrence of the needle (excluding the needle). *

* @param string $encoding [optional]

Set the charset for e.g. "mb_" function

* @param bool $clean_utf8 [optional]

Remove non UTF-8 chars from the string.

* * @psalm-pure * * @return false|string *

A sub-string,
or false if needle is not found.

*/
    public static function stristr(
        string $haystack,
        string $needle,
        bool $before_needle = false,
        string $encoding = 'UTF-8',
        bool $clean_utf8 = false
    ) {
        $is_php8 = \PHP_VERSION_ID >= 80000;

        if ($haystack === '') {
            return ($is_php8 && $needle === '') ? '' : false;
        }

        if ($clean_utf8) {
            // "mb_strpos()" and "iconv_strpos()" returns wrong position,
            // if invalid characters are found in $haystack before $needle
            $needle = self::clean($needle);
            $haystack = self::clean($haystack);
        }

        // An empty needle matches at the very start on PHP >= 8 only.
        if ($needle === '') {
            return $is_php8 ? $haystack : false;
        }

        //
        // preferred way: mbstring
        //

        if (self::$SUPPORT['mbstring'] === true) {
            if ($encoding !== 'UTF-8') {
                $encoding = self::normalize_encoding($encoding, 'UTF-8');

                return \mb_stristr($haystack, $needle, $before_needle, $encoding);
            }

            return \mb_stristr($haystack, $needle, $before_needle);
        }

        $encoding = self::normalize_encoding($encoding, 'UTF-8');

        if (
            self::$SUPPORT['mbstring'] === false
            &&
            $encoding !== 'UTF-8'
        ) {
            /**
             * @psalm-suppress ImpureFunctionCall - this is only a warning
             */
            \trigger_error('UTF8::stristr() without mbstring cannot handle "' . $encoding . '" encoding', \E_USER_WARNING);
        }

        //
        // fallback via intl
        //

        $can_use_grapheme = $encoding === 'UTF-8' // INFO: "grapheme_stristr()" can't handle other encodings
            && self::$SUPPORT['intl'] === true;
        if ($can_use_grapheme) {
            $grapheme_result = \grapheme_stristr($haystack, $needle, $before_needle);
            if ($grapheme_result !== false) {
                return $grapheme_result;
            }
        }

        //
        // fallback for ascii only
        //

        if (ASCII::is_ascii($needle . $haystack)) {
            return \stristr($haystack, $needle, $before_needle);
        }

        //
        // fallback via vanilla php
        //

        // Lazily capture everything in front of the first case-insensitive hit.
        \preg_match('/^(.*?)' . \preg_quote($needle, '/') . '/usi', $haystack, $match);

        if (!isset($match[1])) {
            return false;
        }

        $prefix = $match[1];
        if ($before_needle) {
            return $prefix;
        }

        // Everything from the end of the prefix onwards.
        return self::substr($haystack, (int) self::strlen($prefix, $encoding), null, $encoding);
    }

    /**
     * Get the string length, not the byte-length!
     *
     * INFO: use UTF8::strwidth() for the char-length
     *
     * EXAMPLE: UTF8::strlen("Iñtërnâtiôn\xE9àlizætiøn")); // 20
     *
     * @see http://php.net/manual/en/function.mb-strlen.php
     *
     * @param string $str

The string being checked for length.

* @param string $encoding [optional]

Set the charset for e.g. "mb_" function

* @param bool $clean_utf8 [optional]

Remove non UTF-8 chars from the string.

* * @psalm-pure * * @return false|int *

* The number (int) of characters in the string $str having character encoding * $encoding. * (One multi-byte character counted as +1). *
* Can return false, if e.g. mbstring is not installed and we process invalid * chars. *

*
     * @phpstan-return false|0|positive-int
     */
    public static function strlen(
        string $str,
        string $encoding = 'UTF-8',
        bool $clean_utf8 = false
    ) {
        if ($str === '') {
            return 0;
        }

        if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
            $encoding = self::normalize_encoding($encoding, 'UTF-8');
        }

        if ($clean_utf8) {
            // "mb_strlen" and "\iconv_strlen" returns wrong length,
            // if invalid characters are found in $str
            $str = self::clean($str);
        }

        //
        // fallback via mbstring
        //

        if (self::$SUPPORT['mbstring'] === true) {
            /** @noinspection PhpUsageOfSilenceOperatorInspection - ignore warnings, it's working anyway */
            return $encoding === 'UTF-8'
                ? @\mb_strlen($str)
                : @\mb_strlen($str, $encoding);
        }

        //
        // fallback for binary || ascii only
        //

        if ($encoding === 'CP850' || $encoding === 'ASCII') {
            return \strlen($str);
        }

        if (
            $encoding !== 'UTF-8'
            &&
            self::$SUPPORT['mbstring'] === false
            &&
            self::$SUPPORT['iconv'] === false
        ) {
            /**
             * @psalm-suppress ImpureFunctionCall - this is only a warning
             */
            \trigger_error('UTF8::strlen() without mbstring / iconv cannot handle "' . $encoding . '" encoding', \E_USER_WARNING);
        }

        //
        // fallback via iconv
        //

        if (self::$SUPPORT['iconv'] === true) {
            $iconv_length = \iconv_strlen($str, $encoding);
            if ($iconv_length !== false) {
                return $iconv_length;
            }
        }

        //
        // fallback via intl
        //

        $can_use_grapheme = $encoding === 'UTF-8' // INFO: "grapheme_strlen()" can't handle other encodings
            && self::$SUPPORT['intl'] === true;
        if ($can_use_grapheme) {
            $grapheme_length = \grapheme_strlen($str);
            /* @phpstan-ignore-next-line | "grapheme_strlen" will maybe return "null" for empty-strings and "false" on error */
            if ($grapheme_length !== false && $grapheme_length !== null) {
                return $grapheme_length;
            }
        }

        //
        // fallback for ascii only
        //

        if (ASCII::is_ascii($str)) {
            return \strlen($str);
        }

        //
        // fallback via vanilla php
        //

        // Count the matches of "any single character" in UTF-8 mode; for a
        // non-empty input, zero matches is reported as failure.
        \preg_match_all('/./us', $str, $parts);
        $char_count = \count($parts[0]);

        return $char_count === 0 ? false : $char_count;
    }

    /**
     * Get string length in byte.
     *
     * @param string $str
     *
     * @psalm-pure
     *
     * @return int
     *
     * @phpstan-return 0|positive-int
     */
    public static function strlen_in_byte(string $str): int
    {
        if ($str === '') {
            return 0;
        }

        // "mb_" is available if overload is used, so use it ...
        return self::$SUPPORT['mbstring_func_overload'] === true
            ? \mb_strlen($str, 'CP850') // 8-BIT
            : \strlen($str);
    }

    /**
     * Case-insensitive string comparisons using a "natural order" algorithm.
     *
     * INFO: natural order version of UTF8::strcasecmp()
     *
     * EXAMPLES:


     * UTF8::strnatcasecmp('2', '10Hello WORLD 中文空白!'); // -1
     * UTF8::strcasecmp('2Hello world 中文空白!', '10Hello WORLD 中文空白!'); // 1
     *
     * UTF8::strnatcasecmp('10Hello world 中文空白!', '2Hello WORLD 中文空白!'); // 1
     * UTF8::strcasecmp('10Hello world 中文空白!', '2Hello WORLD 中文空白!'); // -1
     *

* * @param string $str1

The first string.

* @param string $str2

The second string.

* @param string $encoding [optional]

Set the charset for e.g. "mb_" function

* * @psalm-pure * * @return int * < 0 if str1 is less than str2
* > 0 if str1 is greater than str2
* 0 if they are equal
     */
    public static function strnatcasecmp(string $str1, string $str2, string $encoding = 'UTF-8'): int
    {
        // Case-fold both operands the same way, then compare in natural order.
        $folded1 = self::strtocasefold($str1, true, false, $encoding, null, false);
        $folded2 = self::strtocasefold($str2, true, false, $encoding, null, false);

        return self::strnatcmp($folded1, $folded2);
    }

    /**
     * String comparisons using a "natural order" algorithm
     *
     * INFO: natural order version of UTF8::strcmp()
     *
     * EXAMPLES:


     * UTF8::strnatcmp('2Hello world 中文空白!', '10Hello WORLD 中文空白!'); // -1
     * UTF8::strcmp('2Hello world 中文空白!', '10Hello WORLD 中文空白!'); // 1
     *
     * UTF8::strnatcmp('10Hello world 中文空白!', '2Hello WORLD 中文空白!'); // 1
     * UTF8::strcmp('10Hello world 中文空白!', '2Hello WORLD 中文空白!'); // -1
     *

* * @see http://php.net/manual/en/function.strnatcmp.php * * @param string $str1

The first string.

* @param string $str2

The second string.

* * @psalm-pure * * @return int * < 0 if str1 is less than str2;
* > 0 if str1 is greater than str2;
* 0 if they are equal
     */
    public static function strnatcmp(string $str1, string $str2): int
    {
        // Identical strings need no folding.
        if ($str1 === $str2) {
            return 0;
        }

        $nat_folded1 = (string) self::strtonatfold($str1);
        $nat_folded2 = (string) self::strtonatfold($str2);

        return \strnatcmp($nat_folded1, $nat_folded2);
    }

    /**
     * Case-insensitive string comparison of the first n characters.
     *
     * EXAMPLE:


     * UTF8::strncasecmp("iñtërnâtiôn\nàlizætiøn321", "iñtërnâtiôn\nàlizætiøn123", 5); // 0
     *

* * @see http://php.net/manual/en/function.strncasecmp.php * * @param string $str1

The first string.

* @param string $str2

The second string.

* @param int $len

The length of strings to be used in the comparison.

* @param string $encoding [optional]

Set the charset for e.g. "mb_" function

* * @psalm-pure * * @return int * < 0 if str1 is less than str2;
* > 0 if str1 is greater than str2;
* 0 if they are equal
     */
    public static function strncasecmp(
        string $str1,
        string $str2,
        int $len,
        string $encoding = 'UTF-8'
    ): int {
        // Case-fold first; the length limit is applied by strncmp() afterwards.
        $folded1 = self::strtocasefold($str1, true, false, $encoding, null, false);
        $folded2 = self::strtocasefold($str2, true, false, $encoding, null, false);

        return self::strncmp($folded1, $folded2, $len);
    }

    /**
     * String comparison of the first n characters.
     *
     * EXAMPLE:


     * UTF8::strncmp("Iñtërnâtiôn\nàlizætiøn321", "Iñtërnâtiôn\nàlizætiøn123", 5); // 0
     *

* * @see http://php.net/manual/en/function.strncmp.php * * @param string $str1

The first string.

* @param string $str2

The second string.

* @param int $len

Number of characters to use in the comparison.

* @param string $encoding [optional]

Set the charset for e.g. "mb_" function

* * @psalm-pure * * @return int * < 0 if str1 is less than str2;
* > 0 if str1 is greater than str2;
* 0 if they are equal
     */
    public static function strncmp(
        string $str1,
        string $str2,
        int $len,
        string $encoding = 'UTF-8'
    ): int {
        if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
            $encoding = self::normalize_encoding($encoding, 'UTF-8');
        }

        // Cut both operands down to their first $len characters ...
        if ($encoding !== 'UTF-8') {
            $head1 = (string) self::substr($str1, 0, $len, $encoding);
            $head2 = (string) self::substr($str2, 0, $len, $encoding);
        } else {
            $head1 = (string) \mb_substr($str1, 0, $len);
            $head2 = (string) \mb_substr($str2, 0, $len);
        }

        // ... and compare what is left.
        return self::strcmp($head1, $head2);
    }

    /**
     * Search a string for any of a set of characters.
     *
     * EXAMPLE: UTF8::strpbrk('-中文空白-', '白'); // '白-'
     *
     * @see http://php.net/manual/en/function.strpbrk.php
     *
     * @param string $haystack

The string where char_list is looked for.

* @param string $char_list

This parameter is case-sensitive.

* * @psalm-pure * * @return false|string *

The string starting from the character found, or false if it is not found.

*/
    public static function strpbrk(string $haystack, string $char_list)
    {
        if ($haystack === '' || $char_list === '') {
            return false;
        }

        // Find the first character of the haystack that is part of the list.
        $pattern = '/' . self::rxClass($char_list) . '/us';
        if (!\preg_match($pattern, $haystack, $m)) {
            return false;
        }

        // Cut the haystack at the byte offset of that character.
        $byte_offset = (int) \strpos($haystack, $m[0]);

        return \substr($haystack, $byte_offset);
    }

    /**
     * Find the position of the first occurrence of a substring in a string.
     *
     * INFO: use UTF8::strpos_in_byte() for the byte-length
     *
     * EXAMPLE: UTF8::strpos('ABC-ÖÄÜ-中文空白-中文空白', '中'); // 8
     *
     * @see http://php.net/manual/en/function.mb-strpos.php
     *
     * @param string $haystack

The string from which to get the position of the first occurrence of needle.

* @param int|string $needle

The string to find in haystack.
Or a code point as int.

* @param int $offset [optional]

The search offset. If it is not specified, 0 is used.

* @param string $encoding [optional]

Set the charset for e.g. "mb_" function

* @param bool $clean_utf8 [optional]

Remove non UTF-8 chars from the string.

* * @psalm-pure * * @return false|int * The (int) numeric position of the first occurrence of needle in the haystack * string.
If needle is not found it returns false.
     *
     * @phpstan-return false|0|positive-int
     */
    public static function strpos(
        string $haystack,
        $needle,
        int $offset = 0,
        string $encoding = 'UTF-8',
        bool $clean_utf8 = false
    ) {
        // Early exit for an empty haystack, before the needle is converted.
        if ($haystack === '') {
            if (\PHP_VERSION_ID >= 80000) {
                if ($needle === '') {
                    return 0;
                }
            } else {
                return false;
            }
        }

        // iconv and mbstring do not support integer $needle
        if ((int) $needle === $needle) {
            $needle = (string) self::chr($needle);
        }
        $needle = (string) $needle;

        // Same check again, now that the needle is guaranteed to be a string.
        if ($haystack === '') {
            if (\PHP_VERSION_ID >= 80000 && $needle === '') {
                return 0;
            }

            return false;
        }

        if ($needle === '' && \PHP_VERSION_ID < 80000) {
            return false;
        }

        if ($clean_utf8) {
            // "mb_strpos()" and "iconv_strpos()" returns wrong position,
            // if invalid characters are found in $haystack before $needle
            $needle = self::clean($needle);
            $haystack = self::clean($haystack);
        }

        if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
            $encoding = self::normalize_encoding($encoding, 'UTF-8');
        }

        //
        // fallback via mbstring
        //

        if (self::$SUPPORT['mbstring'] === true) {
            if ($encoding === 'UTF-8') {
                /** @noinspection PhpUsageOfSilenceOperatorInspection - Offset not contained in string */
                return @\mb_strpos($haystack, $needle, $offset);
            }

            /** @noinspection PhpUsageOfSilenceOperatorInspection - Offset not contained in string */
            return @\mb_strpos($haystack, $needle, $offset, $encoding);
        }

        //
        // fallback for binary || ascii only
        //

        if (
            $encoding === 'CP850'
            ||
            $encoding === 'ASCII'
        ) {
            return \strpos($haystack, $needle, $offset);
        }

        if (
            $encoding !== 'UTF-8'
            &&
            self::$SUPPORT['iconv'] === false
            &&
            self::$SUPPORT['mbstring'] === false
        ) {
            /**
             * @psalm-suppress ImpureFunctionCall - this is only a warning
             */
            \trigger_error('UTF8::strpos() without mbstring / iconv cannot handle "' . $encoding . '" encoding', \E_USER_WARNING);
        }

        //
        // fallback via intl
        //

        if (
            $encoding === 'UTF-8' // INFO: "grapheme_strpos()" can't handle other encodings
            &&
            $offset >= 0 // grapheme_strpos() can't handle negative offset
            &&
            self::$SUPPORT['intl'] === true
        ) {
            $return_tmp = \grapheme_strpos($haystack, $needle, $offset);
            if ($return_tmp !== false) {
                return $return_tmp;
            }
        }

        //
        // fallback via iconv
        //

        if (
            $offset >= 0 // iconv_strpos() can't handle negative offset
            &&
            self::$SUPPORT['iconv'] === true
        ) {
            // ignore invalid negative offset to keep compatibility
            // with php < 5.5.35, < 5.6.21, < 7.0.6
            $return_tmp = \iconv_strpos($haystack, $needle, $offset > 0 ? $offset : 0, $encoding);
            if ($return_tmp !== false) {
                return $return_tmp;
            }
        }

        //
        // fallback for ascii only
        //

        if (ASCII::is_ascii($haystack . $needle)) {
            /** @noinspection PhpUsageOfSilenceOperatorInspection - Offset not contained in string */
            return @\strpos($haystack, $needle, $offset);
        }

        //
        // fallback via vanilla php
        //

        // The haystack is cut at $offset for the byte-wise search below and the
        // offset is added back to the result. A negative offset counts from the
        // end of the haystack, so translate it into the equivalent non-negative
        // character position first; otherwise the returned position would be
        // relative to the cut-off tail instead of the whole haystack.
        if ($offset < 0) {
            $offset = \max(0, (int) self::strlen($haystack, $encoding) + $offset);
        }

        $haystack_tmp = self::substr($haystack, $offset, null, $encoding);
        if ($haystack_tmp === false) {
            $haystack_tmp = '';
        }
        $haystack = (string) $haystack_tmp;

        $pos = \strpos($haystack, $needle);
        if ($pos === false) {
            return false;
        }

        if ($pos) {
            // Convert the byte position into a character position.
            return $offset + (int) self::strlen(\substr($haystack, 0, $pos), $encoding);
        }

        return $offset + 0;
    }

    /**
     * Find the position of the first occurrence of a substring in a string.
     *
     * @param string $haystack

* The string being checked. *

* @param string $needle

* The string to search for in haystack. *

* @param int $offset [optional]

* The search offset. If it is not specified, 0 is used. *

* * @psalm-pure * * @return false|int *

The numeric position of the first occurrence of needle in the * haystack string. If needle is not found, it returns false.

*
     * @phpstan-return false|0|positive-int
     */
    public static function strpos_in_byte(string $haystack, string $needle, int $offset = 0)
    {
        // Empty operands are always reported as "not found".
        if ($needle === '' || $haystack === '') {
            return false;
        }

        // "mb_" is available if overload is used, so use it ...
        return self::$SUPPORT['mbstring_func_overload'] === true
            ? \mb_strpos($haystack, $needle, $offset, 'CP850') // 8-BIT
            : \strpos($haystack, $needle, $offset);
    }

    /**
     * Find the position of the first occurrence of a substring in a string, case-insensitive.
     *
     * @param string $haystack

* The string being checked. *

* @param string $needle

* The string to search for in haystack. *

* @param int $offset [optional]

* The search offset. If it is not specified, 0 is used. *

* * @psalm-pure * * @return false|int *

The numeric position of the first occurrence of needle in the * haystack string. If needle is not found, it returns false.

*
     * @phpstan-return false|0|positive-int
     */
    public static function stripos_in_byte(string $haystack, string $needle, int $offset = 0)
    {
        // Empty operands are always reported as "not found".
        if ($needle === '' || $haystack === '') {
            return false;
        }

        // "mb_" is available if overload is used, so use it ...
        return self::$SUPPORT['mbstring_func_overload'] === true
            ? \mb_stripos($haystack, $needle, $offset, 'CP850') // 8-BIT
            : \stripos($haystack, $needle, $offset);
    }

    /**
     * Find the last occurrence of a character in a string within another.
     *
     * EXAMPLE: UTF8::strrchr('κόσμεκόσμε-äöü', 'κόσμε'); // 'κόσμε-äöü'
     *
     * @see http://php.net/manual/en/function.mb-strrchr.php
     *
     * @param string $haystack

The string from which to get the last occurrence of needle.

* @param string $needle

The string to find in haystack

* @param bool $before_needle [optional]

* Determines which portion of haystack * this function returns. * If set to true, it returns all of haystack * from the beginning to the last occurrence of needle. * If set to false, it returns all of haystack * from the last occurrence of needle to the end, *

* @param string $encoding [optional]

Set the charset for e.g. "mb_" function

* @param bool $clean_utf8 [optional]

Remove non UTF-8 chars from the string.

* * @psalm-pure * * @return false|string *

The portion of haystack or false if needle is not found.

*/
    public static function strrchr(
        string $haystack,
        string $needle,
        bool $before_needle = false,
        string $encoding = 'UTF-8',
        bool $clean_utf8 = false
    ) {
        if ($haystack === '' || $needle === '') {
            return false;
        }

        if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
            $encoding = self::normalize_encoding($encoding, 'UTF-8');
        }

        if ($clean_utf8) {
            // "mb_strpos()" and "iconv_strpos()" returns wrong position,
            // if invalid characters are found in $haystack before $needle
            $needle = self::clean($needle);
            $haystack = self::clean($haystack);
        }

        //
        // fallback via mbstring
        //

        if (self::$SUPPORT['mbstring'] === true) {
            return $encoding === 'UTF-8'
                ? \mb_strrchr($haystack, $needle, $before_needle)
                : \mb_strrchr($haystack, $needle, $before_needle, $encoding);
        }

        //
        // fallback for binary || ascii only
        //

        $is_single_byte_encoding = $encoding === 'CP850' || $encoding === 'ASCII';
        if (!$before_needle && $is_single_byte_encoding) {
            return \strrchr($haystack, $needle);
        }

        if (
            self::$SUPPORT['mbstring'] === false
            &&
            $encoding !== 'UTF-8'
        ) {
            /**
             * @psalm-suppress ImpureFunctionCall - this is only a warning
             */
            \trigger_error('UTF8::strrchr() without mbstring cannot handle "' . $encoding . '" encoding', \E_USER_WARNING);
        }

        //
        // fallback via iconv, or via vanilla php
        //

        // Both remaining fallbacks search for the first character of the needle only.
        $first_char = self::substr($needle, 0, 1, $encoding);
        if ($first_char === false) {
            return false;
        }

        $pos = self::$SUPPORT['iconv'] === true
            ? \iconv_strrpos($haystack, $first_char, $encoding)
            : self::strrpos($haystack, $first_char, 0, $encoding);
        if ($pos === false) {
            return false;
        }

        return $before_needle
            ? self::substr($haystack, 0, $pos, $encoding)
            : self::substr($haystack, $pos, null, $encoding);
    }

    /**
     * Reverses characters order in the string.
     *
     * EXAMPLE: UTF8::strrev('κ-öäü'); // 'üäö-κ'
     *
     * @param string $str

The input string.

* @param string $encoding [optional]

Set the charset for e.g. "mb_" function

* * @psalm-pure * * @return string *

The string with characters in the reverse sequence.

*/
    public static function strrev(string $str, string $encoding = 'UTF-8'): string
    {
        if ($str === '') {
            return '';
        }

        // Emoji are encoded before the reversal and decoded again afterwards.
        $str = self::emoji_encode($str, true);
        $reversed = '';

        if ($encoding !== 'UTF-8') {
            $encoding = self::normalize_encoding($encoding, 'UTF-8');

            for ($i = (int) self::strlen($str, $encoding) - 1; $i >= 0; --$i) {
                $char = self::substr($str, $i, 1, $encoding);
                if ($char !== false) {
                    $reversed .= $char;
                }
            }
        } elseif (self::$SUPPORT['intl'] === true) {
            // try "grapheme" first: https://stackoverflow.com/questions/17496493/strrev-dosent-support-utf-8
            for ($i = (int) \grapheme_strlen($str) - 1; $i >= 0; --$i) {
                $char = \grapheme_substr($str, $i, 1);
                if ($char !== false) {
                    $reversed .= $char;
                }
            }
        } else {
            for ($i = (int) \mb_strlen($str) - 1; $i >= 0; --$i) {
                $char = \mb_substr($str, $i, 1);
                if ($char !== false) { /* @phpstan-ignore-line | old polyfill will return false, or? */
                    $reversed .= $char;
                }
            }
        }

        return self::emoji_decode($reversed, true);
    }

    /**
     * Find the last occurrence of a character in a string within another, case-insensitive.
     *
     * EXAMPLE: UTF8::strrichr('Aκόσμεκόσμε-äöü', 'aκόσμε'); // 'Aκόσμεκόσμε-äöü'
     *
     * @see http://php.net/manual/en/function.mb-strrichr.php
     *
     * @param string $haystack

The string from which to get the last occurrence of needle.

* @param string $needle

The string to find in haystack.

* @param bool $before_needle [optional]

* @param string $encoding [optional]

Set the charset for e.g. "mb_" function

* @param bool $clean_utf8 [optional]

Remove non UTF-8 chars from the string.

* * @psalm-pure * * @return false|string *

The portion of haystack or
false if needle is not found.

*/
    public static function strrichr(
        string $haystack,
        string $needle,
        bool $before_needle = false,
        string $encoding = 'UTF-8',
        bool $clean_utf8 = false
    ) {
        if ($haystack === '' || $needle === '') {
            return false;
        }

        if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
            $encoding = self::normalize_encoding($encoding, 'UTF-8');
        }

        if ($clean_utf8) {
            // "mb_strpos()" and "iconv_strpos()" returns wrong position,
            // if invalid characters are found in $haystack before $needle
            $needle = self::clean($needle);
            $haystack = self::clean($haystack);
        }

        //
        // fallback via mbstring
        //

        if (self::$SUPPORT['mbstring'] === true) {
            return $encoding === 'UTF-8'
                ? \mb_strrichr($haystack, $needle, $before_needle)
                : \mb_strrichr($haystack, $needle, $before_needle, $encoding);
        }

        //
        // fallback via vanilla php
        //

        // Only the first character of the needle is searched for here.
        $first_char = self::substr($needle, 0, 1, $encoding);
        if ($first_char === false) {
            return false;
        }

        $pos = self::strripos($haystack, $first_char, 0, $encoding);
        if ($pos === false) {
            return false;
        }

        return $before_needle
            ? self::substr($haystack, 0, $pos, $encoding)
            : self::substr($haystack, $pos, null, $encoding);
    }

    /**
     * Find the position of the last occurrence of a substring in a string, case-insensitive.
     *
     * EXAMPLE: UTF8::strripos('ABC-ÖÄÜ-中文空白-中文空白', '中'); // 13
     *
     * @param string $haystack

The string to look in.

* @param int|string $needle

The string to look for.

* @param int $offset [optional]

Number of characters to ignore in the beginning or end.

* @param string $encoding [optional]

Set the charset for e.g. "mb_" function

* @param bool $clean_utf8 [optional]

Remove non UTF-8 chars from the string.

* * @psalm-pure * * @return false|int *

The (int) numeric position of the last occurrence of needle in the haystack * string.
If needle is not found, it returns false.

*/
    public static function strripos(
        string $haystack,
        $needle,
        int $offset = 0,
        string $encoding = 'UTF-8',
        bool $clean_utf8 = false
    ) {
        $is_php8 = \PHP_VERSION_ID >= 80000;

        // Early exit for an empty haystack, before the needle is converted.
        if ($haystack === '') {
            if (!$is_php8) {
                return false;
            }

            if ($needle === '') {
                return 0;
            }
        }

        // iconv and mbstring do not support integer $needle
        if ((int) $needle === $needle && $needle >= 0) {
            $needle = (string) self::chr($needle);
        }
        $needle = (string) $needle;

        // Same check again, now that the needle is guaranteed to be a string.
        if ($haystack === '') {
            return ($is_php8 && $needle === '') ? 0 : false;
        }

        if (!$is_php8 && $needle === '') {
            return false;
        }

        if ($clean_utf8) {
            // mb_strripos() && iconv_strripos() is not tolerant to invalid characters
            $needle = self::clean($needle);
            $haystack = self::clean($haystack);
        }

        if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
            $encoding = self::normalize_encoding($encoding, 'UTF-8');
        }

        //
        // fallback via mbstring
        //

        if (self::$SUPPORT['mbstring'] === true) {
            return $encoding === 'UTF-8'
                ? \mb_strripos($haystack, $needle, $offset)
                : \mb_strripos($haystack, $needle, $offset, $encoding);
        }

        //
        // fallback for binary || ascii only
        //

        if ($encoding === 'CP850' || $encoding === 'ASCII') {
            return \strripos($haystack, $needle, $offset);
        }

        if (
            self::$SUPPORT['mbstring'] === false
            &&
            $encoding !== 'UTF-8'
        ) {
            /**
             * @psalm-suppress ImpureFunctionCall - this is only a warning
             */
            \trigger_error('UTF8::strripos() without mbstring cannot handle "' . $encoding . '" encoding', \E_USER_WARNING);
        }

        //
        // fallback via intl
        //

        $can_use_grapheme = $encoding === 'UTF-8' // INFO: "grapheme_strripos()" can't handle other encodings
            && $offset >= 0 // grapheme_strripos() can't handle negative offset
            && self::$SUPPORT['intl'] === true;
        if ($can_use_grapheme) {
            $grapheme_pos = \grapheme_strripos($haystack, $needle, $offset);
            if ($grapheme_pos !== false) {
                return $grapheme_pos;
            }
        }

        //
        // fallback for ascii only
        //

        if (ASCII::is_ascii($haystack . $needle)) {
            return \strripos($haystack, $needle, $offset);
        }

        //
        // fallback via vanilla php
        //

        // Case-fold both sides, then run the case-sensitive reverse search.
        $folded_haystack = self::strtocasefold($haystack, true, false, $encoding);
        $folded_needle = self::strtocasefold($needle, true, false, $encoding);

        return self::strrpos($folded_haystack, $folded_needle, $offset, $encoding, $clean_utf8);
    }

    /**
     * Finds position of last occurrence of a string within another, case-insensitive.
     *
     * @param string $haystack

* The string from which to get the position of the last occurrence * of needle. *

* @param string $needle

* The string to find in haystack. *

* @param int $offset [optional]

* The position in haystack * to start searching. *

* * @psalm-pure * * @return false|int *

Return the numeric position of the last occurrence of needle in the haystack string, or false if needle is not found.

*/
    public static function strripos_in_byte(string $haystack, string $needle, int $offset = 0)
    {
        // Nothing can be located in, or with, an empty string.
        if ($needle === '' || $haystack === '') {
            return false;
        }

        // "mb_" is available if overload is used, so use it ... (CP850 === 8-BIT)
        return self::$SUPPORT['mbstring_func_overload'] === true
            ? \mb_strripos($haystack, $needle, $offset, 'CP850')
            : \strripos($haystack, $needle, $offset);
    }

    /**
     * Find the position of the last occurrence of a substring in a string.
     *
     * EXAMPLE: UTF8::strrpos('ABC-ÖÄÜ-中文空白-中文空白', '中'); // 13
     *
     * @see http://php.net/manual/en/function.mb-strrpos.php
     *
     * @param string $haystack

The string being checked, for the last occurrence of needle

* @param int|string $needle

The string to find in haystack.
Or a code point as int.

* @param int $offset [optional]

May be specified to begin searching an arbitrary number of characters * into the string. Negative values will stop searching at an arbitrary point prior to * the end of the string. *

* @param string $encoding [optional]

Set the charset.

* @param bool $clean_utf8 [optional]

Remove non UTF-8 chars from the string.

* * @psalm-pure * * @return false|int *

The (int) numeric position of the last occurrence of needle in the haystack * string.
If needle is not found, it returns false.

*/
    public static function strrpos(
        string $haystack,
        $needle,
        int $offset = 0,
        string $encoding = 'UTF-8',
        bool $clean_utf8 = false
    ) {
        $is_php8 = \PHP_VERSION_ID >= 80000;

        // Empty haystack, checked against the raw needle first:
        // PHP < 8 reports "not found", PHP >= 8 finds an empty needle at position 0.
        if ($haystack === '') {
            if (!$is_php8) {
                return false;
            }

            if ($needle === '') {
                return 0;
            }
        }

        // iconv and mbstring do not support integer $needle
        if ((int) $needle === $needle && $needle >= 0) {
            $needle = (string) self::chr($needle);
        }
        $needle = (string) $needle;

        // Same decision again, now against the string form of the needle.
        if ($haystack === '') {
            return ($is_php8 && $needle === '') ? 0 : false;
        }

        if (!$is_php8 && $needle === '') {
            return false;
        }

        if ($clean_utf8) {
            // mb_strrpos && iconv_strrpos is not tolerant to invalid characters
            $needle = self::clean($needle);
            $haystack = self::clean($haystack);
        }

        if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
            $encoding = self::normalize_encoding($encoding, 'UTF-8');
        }

        //
        // fallback via mbstring
        //

        if (self::$SUPPORT['mbstring'] === true) {
            return $encoding === 'UTF-8'
                ? \mb_strrpos($haystack, $needle, $offset)
                : \mb_strrpos($haystack, $needle, $offset, $encoding);
        }

        //
        // fallback for binary || ascii only
        //

        if (\in_array($encoding, ['CP850', 'ASCII'], true)) {
            return \strrpos($haystack, $needle, $offset);
        }

        if (
            self::$SUPPORT['mbstring'] === false
            &&
            $encoding !== 'UTF-8'
        ) {
            /**
             * @psalm-suppress ImpureFunctionCall - this is only a warning
             */
            \trigger_error('UTF8::strrpos() without mbstring cannot handle "' . $encoding . '" encoding', \E_USER_WARNING);
        }

        //
        // fallback via intl
        //

        if (
            $encoding === 'UTF-8' // INFO: "grapheme_strrpos()" can't handle other encodings
            &&
            $offset >= 0 // grapheme_strrpos() can't handle negative offset
            &&
            self::$SUPPORT['intl'] === true
        ) {
            $grapheme_pos = \grapheme_strrpos($haystack, $needle, $offset);
            if ($grapheme_pos !== false) {
                return $grapheme_pos;
            }
        }

        //
        // fallback for ascii only
        //

        if (ASCII::is_ascii($haystack . $needle)) {
            return \strrpos($haystack, $needle, $offset);
        }

        //
        // fallback via vanilla php
        //

        // Cut the haystack down to the searched window; a failed substr() becomes ''.
        if ($offset > 0) {
            $haystack = (string) self::substr($haystack, $offset);
        } elseif ($offset < 0) {
            $haystack = (string) self::substr($haystack, 0, $offset);
            $offset = 0;
        }

        $byte_pos = \strrpos($haystack, $needle);
        if ($byte_pos === false) {
            return false;
        }

        // Convert the byte position into a character position by measuring the prefix.
        /** @var false|string $prefix - needed for PhpStan (stubs error) */
        $prefix = \substr($haystack, 0, $byte_pos);
        if ($prefix === false) {
            return false;
        }

        return $offset + (int) self::strlen($prefix);
    }

    /**
     * Find the position of the last occurrence of a substring in a string.
     *
     * @param string $haystack

* The string being checked, for the last occurrence * of needle. *

* @param string $needle

* The string to find in haystack. *

* @param int $offset [optional]

May be specified to begin searching an arbitrary number of characters into * the string. Negative values will stop searching at an arbitrary point * prior to the end of the string. *

* * @psalm-pure * * @return false|int *

The numeric position of the last occurrence of needle in the * haystack string. If needle is not found, it returns false.

*/
    public static function strrpos_in_byte(string $haystack, string $needle, int $offset = 0)
    {
        // Nothing can be located in, or with, an empty string.
        if ($needle === '' || $haystack === '') {
            return false;
        }

        // "mb_" is available if overload is used, so use it ... (CP850 === 8-BIT)
        return self::$SUPPORT['mbstring_func_overload'] === true
            ? \mb_strrpos($haystack, $needle, $offset, 'CP850')
            : \strrpos($haystack, $needle, $offset);
    }

    /**
     * Finds the length of the initial segment of a string consisting entirely of characters contained within a given
     * mask.
     *
     * EXAMPLE: UTF8::strspn('iñtërnâtiônàlizætiøn', 'itñ'); // '3'
     *
     * @param string $str

The input string.

* @param string $mask

The mask of chars

* @param int $offset [optional] * @param int|null $length [optional] * @param string $encoding [optional]

Set the charset.

*
* @psalm-pure
*
* @return false|int
*/
    public static function strspn(
        string $str,
        string $mask,
        int $offset = 0,
        int $length = null,
        string $encoding = 'UTF-8'
    ) {
        if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
            $encoding = self::normalize_encoding($encoding, 'UTF-8');
        }

        // Restrict the subject to the requested window first.
        if ($offset || $length !== null) {
            if ($encoding !== 'UTF-8') {
                $str = (string) self::substr($str, $offset, $length, $encoding);
            } elseif ($length === null) {
                $str = (string) \mb_substr($str, $offset);
            } else {
                $str = (string) \mb_substr($str, $offset, $length);
            }
        }

        if ($mask === '' || $str === '') {
            return 0;
        }

        // Match the leading run of mask characters and measure it.
        $matches = [];
        if (!\preg_match('/^' . self::rxClass($mask) . '+/u', $str, $matches)) {
            return 0;
        }

        return (int) self::strlen($matches[0], $encoding);
    }

    /**
     * Returns part of haystack string from the first occurrence of needle to the end of haystack.
     *
     * EXAMPLE:


     * $str = 'iñtërnâtiônàlizætiøn';
     * $search = 'nât';
     *
     * UTF8::strstr($str, $search)); // 'nâtiônàlizætiøn'
     * UTF8::strstr($str, $search, true)); // 'iñtër'
     *

* * @param string $haystack

The input string. Must be valid UTF-8.

* @param string $needle

The string to look for. Must be valid UTF-8.

* @param bool $before_needle [optional]

* If TRUE, strstr() returns the part of the * haystack before the first occurrence of the needle (excluding the needle). *

* @param string $encoding [optional]

Set the charset for e.g. "mb_" function

* @param bool $clean_utf8 [optional]

Remove non UTF-8 chars from the string.

* * @psalm-pure * * @return false|string *

A sub-string,
or false if needle is not found.

*/
    public static function strstr(
        string $haystack,
        string $needle,
        bool $before_needle = false,
        string $encoding = 'UTF-8',
        bool $clean_utf8 = false
    ) {
        $is_php8 = \PHP_VERSION_ID >= 80000;

        // Empty haystack: only PHP >= 8 with an empty needle yields a (empty) match.
        if ($haystack === '') {
            return ($is_php8 && $needle === '') ? '' : false;
        }

        if ($clean_utf8) {
            // "mb_strpos()" and "iconv_strpos()" returns wrong position,
            // if invalid characters are found in $haystack before $needle
            $needle = self::clean($needle);
            $haystack = self::clean($haystack);
        }

        // Empty needle: PHP >= 8 returns the whole haystack, older versions return false.
        if ($needle === '') {
            return $is_php8 ? $haystack : false;
        }

        if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
            $encoding = self::normalize_encoding($encoding, 'UTF-8');
        }

        //
        // fallback via mbstring
        //

        if (self::$SUPPORT['mbstring'] === true) {
            return $encoding === 'UTF-8'
                ? \mb_strstr($haystack, $needle, $before_needle)
                : \mb_strstr($haystack, $needle, $before_needle, $encoding);
        }

        //
        // fallback for binary || ascii only
        //

        if (\in_array($encoding, ['CP850', 'ASCII'], true)) {
            return \strstr($haystack, $needle, $before_needle);
        }

        if (
            self::$SUPPORT['mbstring'] === false
            &&
            $encoding !== 'UTF-8'
        ) {
            /**
             * @psalm-suppress ImpureFunctionCall - this is only a warning
             */
            \trigger_error('UTF8::strstr() without mbstring cannot handle "' . $encoding . '" encoding', \E_USER_WARNING);
        }

        //
        // fallback via intl
        //

        if (
            $encoding === 'UTF-8' // INFO: "grapheme_strstr()" can't handle other encodings
            &&
            self::$SUPPORT['intl'] === true
        ) {
            $grapheme_result = \grapheme_strstr($haystack, $needle, $before_needle);
            if ($grapheme_result !== false) {
                return $grapheme_result;
            }
        }

        //
        // fallback for ascii only
        //

        if (ASCII::is_ascii($haystack . $needle)) {
            return \strstr($haystack, $needle, $before_needle);
        }

        //
        // fallback via vanilla php: lazily capture everything in front of the first needle
        //

        \preg_match('/^(.*?)' . \preg_quote($needle, '/') . '/us', $haystack, $match);

        if (!isset($match[1])) {
            return false;
        }

        $before = $match[1];

        return $before_needle
            ? $before
            : self::substr($haystack, (int) self::strlen($before));
    }

    /**
     * Finds first occurrence of a string within another.
     *
     * @param string $haystack

* The string from which to get the first occurrence * of needle. *

* @param string $needle

* The string to find in haystack. *

* @param bool $before_needle [optional]

* Determines which portion of haystack * this function returns. * If set to true, it returns all of haystack * from the beginning to the first occurrence of needle. * If set to false, it returns all of haystack * from the first occurrence of needle to the end, *

* * @psalm-pure * * @return false|string *

The portion of haystack, * or false if needle is not found.

*/
    public static function strstr_in_byte(
        string $haystack,
        string $needle,
        bool $before_needle = false
    ) {
        // Nothing can be located in, or with, an empty string.
        if ($needle === '' || $haystack === '') {
            return false;
        }

        // "mb_" is available if overload is used, so use it ... (CP850 === 8-BIT)
        return self::$SUPPORT['mbstring_func_overload'] === true
            ? \mb_strstr($haystack, $needle, $before_needle, 'CP850')
            : \strstr($haystack, $needle, $before_needle);
    }

    /**
     * Unicode transformation for case-less matching.
     *
     * EXAMPLE: UTF8::strtocasefold('ǰ◌̱'); // 'ǰ◌̱'
     *
     * @see http://unicode.org/reports/tr21/tr21-5.html
     *
     * @param string $str

The input string.

* @param bool $full [optional]

* true, replace full case folding chars (default)
* false, use only limited static array [UTF8::$COMMON_CASE_FOLD] *

* @param bool $clean_utf8 [optional]

Remove non UTF-8 chars from the string.

* @param string $encoding [optional]

Set the charset.

* @param string|null $lang [optional]

Set the language for special cases: az, el, lt, tr

* @param bool $lower [optional]

Use lowercase string, otherwise use uppercase string. PS: uppercase * is for some languages better ...

*
* @psalm-pure
*
* @return string
*/
    public static function strtocasefold(
        string $str,
        bool $full = true,
        bool $clean_utf8 = false,
        string $encoding = 'UTF-8',
        string $lang = null,
        bool $lower = true
    ): string {
        if ($str === '') {
            return '';
        }

        if ($clean_utf8) {
            // strip invalid UTF-8 sequences before any case mapping is applied
            $str = self::clean($str);
        }

        $str = self::fixStrCaseHelper($str, $lower, $full);

        // Fast path: plain UTF-8 without language-specific rules.
        if ($lang === null && $encoding === 'UTF-8') {
            return $lower ? \mb_strtolower($str) : \mb_strtoupper($str);
        }

        // Otherwise delegate to the encoding- and language-aware converters.
        return $lower
            ? self::strtolower($str, $encoding, false, $lang)
            : self::strtoupper($str, $encoding, false, $lang);
    }

    /**
     * Make a string lowercase.
     *
     * EXAMPLE: UTF8::strtolower('DÉJÀ Σσς Iıİi'); // 'déjà σσς iıii'
     *
     * @see http://php.net/manual/en/function.mb-strtolower.php
     *
     * @param string $str

The string being lowercased.

* @param string $encoding [optional]

Set the charset for e.g. "mb_" function

* @param bool $clean_utf8 [optional]

Remove non UTF-8 chars from the string.

* @param string|null $lang [optional]

Set the language for special cases: az, el, lt, * tr

* @param bool $try_to_keep_the_string_length [optional]

true === try to keep the string length: e.g. ẞ * -> ß

* * @psalm-pure * * @return string *

String with all alphabetic characters converted to lowercase.

*/
    public static function strtolower(
        $str,
        string $encoding = 'UTF-8',
        bool $clean_utf8 = false,
        string $lang = null,
        bool $try_to_keep_the_string_length = false
    ): string {
        // init
        $str = (string) $str;

        if ($str === '') {
            return '';
        }

        if ($clean_utf8) {
            // strip invalid UTF-8 sequences before any case mapping is applied
            $str = self::clean($str);
        }

        // hack for old php version or for the polyfill ...
        if ($try_to_keep_the_string_length) {
            $str = self::fixStrCaseHelper($str, true);
        }

        // Fast path: plain UTF-8 without language-specific rules.
        if ($lang === null && $encoding === 'UTF-8') {
            return \mb_strtolower($str);
        }

        $encoding = self::normalize_encoding($encoding, 'UTF-8');

        if ($lang === null) {
            // always fallback via symfony polyfill
            return \mb_strtolower($str, $encoding);
        }

        if (self::$SUPPORT['intl'] !== true) {
            /**
             * @psalm-suppress ImpureFunctionCall - this is only a warning
             */
            \trigger_error('UTF8::strtolower() without intl cannot handle the "lang" parameter: ' . $lang, \E_USER_WARNING);

            // always fallback via symfony polyfill
            return \mb_strtolower($str, $encoding);
        }

        // Load the list of available transliterators on first use.
        if (self::$INTL_TRANSLITERATOR_LIST === null) {
            self::$INTL_TRANSLITERATOR_LIST = self::getData('transliterator_list');
        }

        $transliterator_id = $lang . '-Lower';
        if (!\in_array($transliterator_id, self::$INTL_TRANSLITERATOR_LIST, true)) {
            /**
             * @psalm-suppress ImpureFunctionCall - this is only a warning
             */
            \trigger_error('UTF8::strtolower() cannot handle special language: ' . $lang . ' | supported: ' . \print_r(self::$INTL_TRANSLITERATOR_LIST, true), \E_USER_WARNING);

            // unknown language: use the generic transliterator instead
            $transliterator_id = 'Any-Lower';
        }

        return (string) \transliterator_transliterate($transliterator_id, $str);
    }

    /**
     * Make a string uppercase.
     *
     * EXAMPLE: UTF8::strtoupper('Déjà Σσς Iıİi'); // 'DÉJÀ ΣΣΣ IIİI'
     *
     * @see http://php.net/manual/en/function.mb-strtoupper.php
     *
     * @param string $str

The string being uppercased.

* @param string $encoding [optional]

Set the charset.

* @param bool $clean_utf8 [optional]

Remove non UTF-8 chars from the string.

* @param string|null $lang [optional]

Set the language for special cases: az, el, lt, * tr

* @param bool $try_to_keep_the_string_length [optional]

true === try to keep the string length: e.g. ẞ * -> ß

* * @psalm-pure * * @return string *

String with all alphabetic characters converted to uppercase.

*/
    public static function strtoupper(
        $str,
        string $encoding = 'UTF-8',
        bool $clean_utf8 = false,
        string $lang = null,
        bool $try_to_keep_the_string_length = false
    ): string {
        // init
        $str = (string) $str;

        if ($str === '') {
            return '';
        }

        if ($clean_utf8) {
            // strip invalid UTF-8 sequences before any case mapping is applied
            $str = self::clean($str);
        }

        // hack for old php version or for the polyfill ...
        if ($try_to_keep_the_string_length) {
            $str = self::fixStrCaseHelper($str);
        }

        // Fast path: plain UTF-8 without language-specific rules.
        if ($lang === null && $encoding === 'UTF-8') {
            return \mb_strtoupper($str);
        }

        $encoding = self::normalize_encoding($encoding, 'UTF-8');

        if ($lang !== null) {
            if (self::$SUPPORT['intl'] === true) {
                // Load the list of available transliterators on first use.
                if (self::$INTL_TRANSLITERATOR_LIST === null) {
                    self::$INTL_TRANSLITERATOR_LIST = self::getData('transliterator_list');
                }

                $language_code = $lang . '-Upper';
                if (!\in_array($language_code, self::$INTL_TRANSLITERATOR_LIST, true)) {
                    // intl IS available in this branch; the language itself is unsupported,
                    // so the message mirrors the one emitted by UTF8::strtolower().
                    /**
                     * @psalm-suppress ImpureFunctionCall - this is only a warning
                     */
                    \trigger_error('UTF8::strtoupper() cannot handle special language: ' . $lang . ' | supported: ' . \print_r(self::$INTL_TRANSLITERATOR_LIST, true), \E_USER_WARNING);

                    // unknown language: use the generic transliterator instead
                    $language_code = 'Any-Upper';
                }

                return (string) \transliterator_transliterate($language_code, $str);
            }

            // The warning must name this method (it previously said "strtolower").
            /**
             * @psalm-suppress ImpureFunctionCall - this is only a warning
             */
            \trigger_error('UTF8::strtoupper() without intl cannot handle the "lang"-parameter: ' . $lang, \E_USER_WARNING);
        }

        // always fallback via symfony polyfill
        return \mb_strtoupper($str, $encoding);
    }

    /**
     * Translate characters or replace sub-strings.
     *
     * EXAMPLE:
     *


     * $array = [
     *     'Hello'   => '○●◎',
     *     '中文空白' => 'earth',
     * ];
     * UTF8::strtr('Hello 中文空白', $array); // '○●◎ earth'
     *

* * @see http://php.net/manual/en/function.strtr.php * * @param string $str

The string being translated.

* @param string|string[] $from

The string replacing from.

* @param string|string[] $to [optional]

The string being translated to.

* * @psalm-pure * * @return string *

This function returns a copy of str, translating all occurrences of each character in "from" * to the corresponding character in "to".

*/
    public static function strtr(string $str, $from, $to = ''): string
    {
        if ($str === '') {
            return '';
        }

        // Replacing something by itself is a no-op.
        if ($from === $to) {
            return $str;
        }

        if ($to !== '') {
            // Work on per-character lists so that multi-byte characters stay intact.
            if (!\is_array($from)) {
                $from = self::str_split($from);
            }

            if (!\is_array($to)) {
                $to = self::str_split($to);
            }

            // Truncate the longer list so both sides pair up one-to-one.
            $count_from = \count($from);
            $count_to = \count($to);

            if ($count_from > $count_to) {
                $from = \array_slice($from, 0, $count_to);
            } elseif ($count_from < $count_to) {
                $to = \array_slice($to, 0, $count_from);
            }

            // Build the map in a separate variable: $from must stay intact so the
            // exception message below can still report the offending input.
            try {
                $pairs = \array_combine($from, $to);
            } catch (\Error $e) {
                // PHP >= 8.0 : array_combine() will now throw a ValueError if the number of elements for each array is not equal; previously this function returned false instead.
                $pairs = false;
            }

            if ($pairs === false) {
                throw new \InvalidArgumentException('The number of elements for each array isn\'t equal or the arrays are empty: (from: ' . \print_r($from, true) . ' | to: ' . \print_r($to, true) . ')');
            }

            $from = $pairs;
        }

        // Only reachable with $to === '': a plain string $from is removed/replaced as a whole.
        if (\is_string($from)) {
            return \str_replace($from, $to, $str);
        }

        // $from is a "search => replace" map at this point.
        return \strtr($str, $from);
    }

    /**
     * Return the width of a string.
     *
     * INFO: use UTF8::strlen() for the byte-length
     *
     * EXAMPLE: UTF8::strwidth("Iñtërnâtiôn\xE9àlizætiøn")); // 21
     *
     * @param string $str

The input string.

* @param string $encoding [optional]

Set the charset for e.g. "mb_" function

* @param bool $clean_utf8 [optional]

Remove non UTF-8 chars from the string.

*
* @psalm-pure
*
* @return int
*
* @phpstan-return 0|positive-int
*/
    public static function strwidth(
        string $str,
        string $encoding = 'UTF-8',
        bool $clean_utf8 = false
    ): int {
        if ($str === '') {
            return 0;
        }

        if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
            $encoding = self::normalize_encoding($encoding, 'UTF-8');
        }

        if ($clean_utf8) {
            // iconv and mbstring are not tolerant to invalid encoding
            // further, their behaviour is inconsistent with that of PHP's substr
            $str = self::clean($str);
        }

        //
        // fallback via mbstring
        //

        if (self::$SUPPORT['mbstring'] === true) {
            return $encoding === 'UTF-8'
                ? \mb_strwidth($str)
                : \mb_strwidth($str, $encoding);
        }

        //
        // fallback via vanilla php
        //

        if ($encoding !== 'UTF-8') {
            $str = self::encode('UTF-8', $str, false, $encoding);
        }

        // Strip every character matched by the class below and count the removals;
        // each removed character is counted as two columns, each remaining one as one.
        $wide_count = 0;
        $narrow_part = (string) \preg_replace(
            '/[\x{1100}-\x{115F}\x{2329}\x{232A}\x{2E80}-\x{303E}\x{3040}-\x{A4CF}\x{AC00}-\x{D7A3}\x{F900}-\x{FAFF}\x{FE10}-\x{FE19}\x{FE30}-\x{FE6F}\x{FF00}-\x{FF60}\x{FFE0}-\x{FFE6}\x{20000}-\x{2FFFD}\x{30000}-\x{3FFFD}]/u',
            '',
            $str,
            -1,
            $wide_count
        );

        /* @phpstan-ignore-next-line | should return 0|positive-int */
        return ($wide_count << 1) + (int) self::strlen($narrow_part);
    }

    /**
     * Get part of a string.
     *
     * EXAMPLE: UTF8::substr('中文空白', 1, 2); // '文空'
     *
     * @see http://php.net/manual/en/function.mb-substr.php
     *
     * @param string $str

The string being checked.

* @param int $offset

The first position used in str.

* @param int|null $length [optional]

The maximum length of the returned string.

* @param string $encoding [optional]

Set the charset for e.g. "mb_" function

* @param bool $clean_utf8 [optional]

Remove non UTF-8 chars from the string.

* * @psalm-pure * * @return false|string * The portion of str specified by the offset and * length parameters.

If str is shorter than offset
* characters long, FALSE will be returned.
*/
    public static function substr(
        string $str,
        int $offset = 0,
        int $length = null,
        string $encoding = 'UTF-8',
        bool $clean_utf8 = false
    ) {
        // empty string
        if ($length === 0 || $str === '') {
            return '';
        }

        if ($clean_utf8) {
            // iconv and mbstring are not tolerant to invalid encoding
            // further, their behaviour is inconsistent with that of PHP's substr
            $str = self::clean($str);
        }

        // whole string
        if ($length === null && !$offset) {
            return $str;
        }

        if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
            $encoding = self::normalize_encoding($encoding, 'UTF-8');
        }

        //
        // fallback via mbstring
        //

        if (self::$SUPPORT['mbstring'] === true && $encoding === 'UTF-8') {
            return $length === null
                ? \mb_substr($str, $offset)
                : \mb_substr($str, $offset, $length);
        }

        //
        // fallback for binary || ascii only
        //

        if (\in_array($encoding, ['CP850', 'ASCII'], true)) {
            return $length === null
                ? \substr($str, $offset)
                : \substr($str, $offset, $length);
        }

        // otherwise we need the string-length
        $str_length = 0;
        if (
            $offset
            ||
            $length === null /* @phpstan-ignore-line | can be NULL here?! */
        ) {
            $str_length = self::strlen($str, $encoding);
        }

        // e.g.: invalid chars + mbstring not installed
        if ($str_length === false) {
            return false;
        }

        // empty string (offset sits exactly at the end) or impossible (offset beyond the end)
        if (
            ($offset === $str_length && !$length)
            ||
            ($offset && $offset > $str_length)
        ) {
            return '';
        }

        $length = $length ?? $str_length;

        if (
            self::$SUPPORT['mbstring'] === false
            &&
            $encoding !== 'UTF-8'
        ) {
            /**
             * @psalm-suppress ImpureFunctionCall - this is only a warning
             */
            \trigger_error('UTF8::substr() without mbstring cannot handle "' . $encoding . '" encoding', \E_USER_WARNING);
        }

        //
        // fallback via intl
        //

        if (
            $encoding === 'UTF-8' // INFO: "grapheme_substr()" can't handle other encodings
            &&
            $offset >= 0 // grapheme_substr() can't handle negative offset
            &&
            self::$SUPPORT['intl'] === true
        ) {
            $grapheme_part = \grapheme_substr($str, $offset, $length);
            if ($grapheme_part !== false) {
                return $grapheme_part;
            }
        }

        //
        // fallback via iconv
        //

        if (
            $length >= 0 // "iconv_substr()" can't handle negative length
            &&
            self::$SUPPORT['iconv'] === true
        ) {
            $iconv_part = \iconv_substr($str, $offset, $length);
            if ($iconv_part !== false) {
                return $iconv_part;
            }
        }

        //
        // fallback for ascii only
        //

        if (ASCII::is_ascii($str)) {
            return \substr($str, $offset, $length);
        }

        //
        // fallback via vanilla php
        //

        // split to array, and remove invalid characters
        // &&
        // extract relevant part, and join to make string again
        $chars = self::str_split($str);

        return \implode('', \array_slice($chars, $offset, $length));
    }

    /**
     * Binary-safe comparison of two strings from an offset, up to a length of characters.
     *
     * EXAMPLE:
     * UTF8::substr_compare("○●◎\r", '●◎', 0, 2); // -1
     * UTF8::substr_compare("○●◎\r", '◎●', 1, 2); // 1
     * UTF8::substr_compare("○●◎\r", '●◎', 1, 2); // 0
     *
     *
     * @param string $str1

The main string being compared.

* @param string $str2

The secondary string being compared.

* @param int $offset [optional]

The start position for the comparison. If negative, it starts * counting from the end of the string.

* @param int|null $length [optional]

The length of the comparison. The default value is the largest * of the length of the str compared to the length of main_str less the * offset.

* @param bool $case_insensitivity [optional]

If case_insensitivity is TRUE, comparison is case * insensitive.

* @param string $encoding [optional]

Set the charset for e.g. "mb_" function

* * @psalm-pure * * @return int * < 0 if str1 is less than str2;
* > 0 if str1 is greater than str2,
* 0 if they are equal
*/
    public static function substr_compare(
        string $str1,
        string $str2,
        int $offset = 0,
        int $length = null,
        bool $case_insensitivity = false,
        string $encoding = 'UTF-8'
    ): int {
        // Cut both strings down to the compared window (only if one was requested).
        if ($offset !== 0 || $length !== null) {
            if ($encoding === 'UTF-8') {
                if ($length === null) {
                    $str1 = (string) \mb_substr($str1, $offset);
                } else {
                    $str1 = (string) \mb_substr($str1, $offset, $length);
                }

                // $str2 is compared over as many characters as were taken from $str1.
                $str2 = (string) \mb_substr($str2, 0, (int) self::strlen($str1));
            } else {
                $encoding = self::normalize_encoding($encoding, 'UTF-8');

                $str1 = (string) self::substr($str1, $offset, $length, $encoding);

                // Measure $str1 in the requested encoding; the UTF-8 default of
                // strlen() would count the wrong number of characters here.
                $str2 = (string) self::substr($str2, 0, (int) self::strlen($str1, $encoding), $encoding);
            }
        }

        if ($case_insensitivity) {
            return self::strcasecmp($str1, $str2, $encoding);
        }

        return self::strcmp($str1, $str2);
    }

    /**
     * Count the number of substring occurrences.
     *
     * EXAMPLE: UTF8::substr_count('中文空白', '文空', 1, 2); // 1
     *
     * @see http://php.net/manual/en/function.substr-count.php
     *
     * @param string $haystack

The string to search in.

* @param string $needle

The substring to search for.

* @param int $offset [optional]

The offset where to start counting.

* @param int|null $length [optional]

* The maximum length after the specified offset to search for the * substring. It outputs a warning if the offset plus the length is * greater than the haystack length. *

* @param string $encoding [optional]

Set the charset for e.g. "mb_" function

* @param bool $clean_utf8 [optional]

Remove non UTF-8 chars from the string.

* * @psalm-pure * * @return false|int *

This function returns an integer, or false if there isn't a string.

*/
    public static function substr_count(
        string $haystack,
        string $needle,
        int $offset = 0,
        int $length = null,
        string $encoding = 'UTF-8',
        bool $clean_utf8 = false
    ) {
        // An empty needle cannot be counted.
        if ($needle === '') {
            return false;
        }

        // Nothing to search in (same result on every PHP version), or a zero-length window.
        if ($haystack === '' || $length === 0) {
            return 0;
        }

        if ($encoding !== 'UTF-8' && $encoding !== 'CP850') {
            $encoding = self::normalize_encoding($encoding, 'UTF-8');
        }

        if ($clean_utf8) {
            // "mb_strpos()" and "iconv_strpos()" returns wrong position,
            // if invalid characters are found in $haystack before $needle
            $needle = self::clean($needle);
            $haystack = self::clean($haystack);
        }

        // Restrict the haystack to the requested window.
        if ($offset || $length > 0) {
            if ($length === null) {
                $haystack_length = self::strlen($haystack, $encoding);
                if ($haystack_length === false) {
                    return false;
                }

                $length = $haystack_length;
            }

            $haystack = $encoding === 'UTF-8'
                ? (string) \mb_substr($haystack, $offset, $length)
                : (string) \mb_substr($haystack, $offset, $length, $encoding);
        }

        if (
            self::$SUPPORT['mbstring'] === false
            &&
            $encoding !== 'UTF-8'
        ) {
            /**
             * @psalm-suppress ImpureFunctionCall - this is only a warning
             */
            \trigger_error('UTF8::substr_count() without mbstring cannot handle "' . $encoding . '" encoding', \E_USER_WARNING);
        }

        if (self::$SUPPORT['mbstring'] === true) {
            return $encoding === 'UTF-8'
                ? \mb_substr_count($haystack, $needle)
                : \mb_substr_count($haystack, $needle, $encoding);
        }

        // Without mbstring: count the regex matches of the quoted needle.
        \preg_match_all('/' . \preg_quote($needle, '/') . '/us', $haystack, $matches, \PREG_SET_ORDER);

        return \count($matches);
    }

    /**
     * Count the number of substring occurrences.
     *
     * @param string $haystack

* The string being checked. *

* @param string $needle

* The string being found. *

* @param int $offset [optional]

* The offset where to start counting *

* @param int|null $length [optional]

* The maximum length after the specified offset to search for the * substring. It outputs a warning if the offset plus the length is * greater than the haystack length. *

* * @psalm-pure * * @return false|int *

The number of times the * needle substring occurs in the * haystack string.

*/
    public static function substr_count_in_byte(
        string $haystack,
        string $needle,
        int $offset = 0,
        int $length = null
    ) {
        if ($haystack === '' || $needle === '') {
            return 0;
        }

        $overloaded = self::$SUPPORT['mbstring_func_overload'] === true;

        // With function overloading the window is cut by hand, because the
        // "mb_" counter used below takes no offset/length arguments.
        if (
            ($offset || $length !== null)
            &&
            $overloaded
        ) {
            if ($length === null) {
                $length_tmp = self::strlen($haystack);
                if ($length_tmp === false) {
                    return false;
                }

                $length = $length_tmp;
            }

            if (
                (
                    $length !== 0
                    &&
                    $offset !== 0
                )
                &&
                ($length + $offset) <= 0
                &&
                // PHP 7.1.0 === 70100 (major * 10000 + minor * 100 + release);
                // the previous value 71000 matched every PHP 7.x release.
                \PHP_VERSION_ID < 70100 // output from "substr_count()" have changed in PHP 7.1
            ) {
                return false;
            }

            /** @var false|string $haystack_tmp - needed for PhpStan (stubs error) */
            $haystack_tmp = \substr($haystack, $offset, $length);
            if ($haystack_tmp === false) {
                $haystack_tmp = '';
            }
            $haystack = (string) $haystack_tmp;
        }

        if ($overloaded) {
            // "mb_" is available if overload is used, so use it ...
            return \mb_substr_count($haystack, $needle, 'CP850'); // 8-BIT
        }

        if ($length === null) {
            return \substr_count($haystack, $needle, $offset);
        }

        return \substr_count($haystack, $needle, $offset, $length);
    }

    /**
     * Returns the number of occurrences of $substring in the given string.
     * By default, the comparison is case-sensitive, but can be made insensitive
     * by setting $case_sensitive to false.
     *
     * @param string $str

The input string.

* @param string $substring

The substring to search for.

* @param bool $case_sensitive [optional]

Whether or not to enforce case-sensitivity. Default: true

* @param string $encoding [optional]

Set the charset for e.g. "mb_" function

*
* @psalm-pure
*
* @return int
*
* @phpstan-return 0|positive-int
*/
    public static function substr_count_simple(
        string $str,
        string $substring,
        bool $case_sensitive = true,
        string $encoding = 'UTF-8'
    ): int {
        if ($substring === '' || $str === '') {
            return 0;
        }

        // UTF-8: upper-case both sides for the case-insensitive variant.
        if ($encoding === 'UTF-8') {
            if (!$case_sensitive) {
                $str = \mb_strtoupper($str);
                $substring = \mb_strtoupper($substring);
            }

            return (int) \mb_substr_count($str, $substring);
        }

        $encoding = self::normalize_encoding($encoding, 'UTF-8');

        // Other encodings: case-fold (to uppercase) both sides for the case-insensitive variant.
        if (!$case_sensitive) {
            $str = self::strtocasefold($str, true, false, $encoding, null, false);
            $substring = self::strtocasefold($substring, true, false, $encoding, null, false);
        }

        return (int) \mb_substr_count($str, $substring, $encoding);
    }

    /**
     * Removes a prefix ($needle) from the beginning of the string ($haystack), case-insensitive.
     *
     * EXAMPLE:


     * UTF8::substr_ileft('ΚόσμεMiddleEnd', 'Κόσμε'); // 'MiddleEnd'
     * UTF8::substr_ileft('ΚόσμεMiddleEnd', 'κόσμε'); // 'MiddleEnd'
     *

* * @param string $haystack

The string to search in.

* @param string $needle

The substring to search for.

* * @psalm-pure * * @return string *

Return the sub-string.

*/
    public static function substr_ileft(string $haystack, string $needle): string
    {
        if ($haystack === '') {
            return '';
        }

        // No prefix given, or the haystack does not start with it (ignoring case): unchanged.
        if ($needle === '' || !self::str_istarts_with($haystack, $needle)) {
            return $haystack;
        }

        // Drop as many leading characters as the needle is long.
        return (string) \mb_substr($haystack, (int) self::strlen($needle));
    }

    /**
     * Get part of a string process in bytes.
     *
     * @param string $str

The string being checked.

* @param int $offset

The first position used in str.

* @param int|null $length [optional]

The maximum length of the returned string.

* * @psalm-pure * * @return false|string *

The portion of str specified by the offset and * length parameters.

If str is shorter than offset * characters long, FALSE will be returned.

*/
    public static function substr_in_byte(string $str, int $offset = 0, int $length = null)
    {
        // empty string
        if ($length === 0 || $str === '') {
            return '';
        }

        // whole string
        if ($length === null && !$offset) {
            return $str;
        }

        // "mb_" is available if overload is used, so use it ... (CP850 === 8-BIT)
        if (self::$SUPPORT['mbstring_func_overload'] === true) {
            return \mb_substr($str, $offset, $length, 'CP850');
        }

        // A missing length is replaced by 2147483647.
        $byte_length = $length ?? 2147483647;

        return \substr($str, $offset, $byte_length);
    }

    /**
     * Removes a suffix ($needle) from the end of the string ($haystack), case-insensitive.
     *
     * EXAMPLE:


     * UTF8::substr_iright('BeginMiddleΚόσμε', 'Κόσμε'); // 'BeginMiddle'
     * UTF8::substr_iright('BeginMiddleΚόσμε', 'κόσμε'); // 'BeginMiddle'
     *

* * @param string $haystack

The string to search in.

* @param string $needle

The substring to search for.

* * @psalm-pure * * @return string *

Return the sub-string.

*/
    public static function substr_iright(string $haystack, string $needle): string
    {
        // An empty haystack or an empty needle leaves nothing to strip; in both
        // cases the haystack itself is the answer ('' for an empty haystack).
        if ($haystack === '' || $needle === '') {
            return $haystack;
        }

        // Only strip when the haystack really ends with the needle (ignoring case).
        if (!self::str_iends_with($haystack, $needle)) {
            return $haystack;
        }

        // Keep everything except the last "length of needle" characters.
        $haystack_length = (int) self::strlen($haystack);
        $needle_length = (int) self::strlen($needle);

        return (string) \mb_substr($haystack, 0, $haystack_length - $needle_length);
    }

    /**
     * Removes a prefix ($needle) from the beginning of the string ($haystack).
     *
     * EXAMPLE:
     * UTF8::substr_left('ΚόσμεMiddleEnd', 'Κόσμε'); // 'MiddleEnd'
     * UTF8::substr_left('ΚόσμεMiddleEnd', 'κόσμε'); // 'ΚόσμεMiddleEnd'
     *
     *
     * @param string $haystack

The string to search in.

* @param string $needle

The substring to search for.

* * @psalm-pure * * @return string *

Return the sub-string.

*/
    public static function substr_left(string $haystack, string $needle): string
    {
        // An empty haystack or an empty needle leaves nothing to strip; in both
        // cases the haystack itself is the answer ('' for an empty haystack).
        if ($haystack === '' || $needle === '') {
            return $haystack;
        }

        // Case-sensitive prefix check: without a match the haystack stays as it is.
        if (!self::str_starts_with($haystack, $needle)) {
            return $haystack;
        }

        // Cut off as many characters as the needle is long.
        $prefix_length = (int) self::strlen($needle);

        return (string) \mb_substr($haystack, $prefix_length);
    }

    /**
     * Replace text within a portion of a string.
     *
     * EXAMPLE:

UTF8::substr_replace(array('Iñtërnâtiônàlizætiøn', 'foo'), 'æ', 1); // array('Iæñtërnâtiônàlizætiøn', 'fæoo')

* * source: https://gist.github.com/stemar/8287074 * * @param string|string[] $str

The input string or an array of strings.

* @param string|string[] $replacement

The replacement string or an array of strings.

* @param int|int[] $offset

* If start is positive, the replacing will begin at the start'th offset * into string. *

* If start is negative, the replacing will begin at the start'th character * from the end of string. *

* @param int|int[]|null $length [optional]

If given and is positive, it represents the length of the * portion of string which is to be replaced. If it is negative, it * represents the number of characters from the end of string at which to * stop replacing. If it is not given, then it will default to strlen( * string ); i.e. end the replacing at the end of string. Of course, if * length is zero then this function will have the effect of inserting * replacement into string at the given start offset.

* @param string $encoding [optional]

Set the charset for e.g. "mb_" function

* * @psalm-pure * * @return string|string[] *

The result string is returned. If string is an array then array is returned.

* * @template TSubstrReplace string|string[] * @phpstan-param TSubstrReplace $str * @phpstan-return TSubstrReplace */
    public static function substr_replace(
        $str,
        $replacement,
        $offset,
        $length = null,
        string $encoding = 'UTF-8'
    ) {
        // ---------------------------------------------------------------
        // Array input: normalize all arguments to arrays of the same size
        // and process every element with the scalar code path below.
        // ---------------------------------------------------------------
        if (\is_array($str)) {
            $num = \count($str);

            // Nothing to process. Without this guard the padded argument arrays
            // (one element each) would make array_map() invent a bogus element.
            if ($num === 0) {
                return $str;
            }

            // the replacement
            if (\is_array($replacement)) {
                $replacement = \array_slice($replacement, 0, $num);
            } else {
                $replacement = \array_pad([$replacement], $num, $replacement);
            }

            // the offset (non-integer entries fall back to 0)
            if (\is_array($offset)) {
                $offset = \array_slice($offset, 0, $num);
                foreach ($offset as &$value_tmp) {
                    $value_tmp = (int) $value_tmp === $value_tmp ? $value_tmp : 0;
                }
                unset($value_tmp);
            } else {
                $offset = \array_pad([$offset], $num, $offset);
            }

            // the length (in array mode a missing length means "insert", i.e. 0;
            // non-integer entries fall back to the number of input strings)
            if ($length === null) {
                $length = \array_fill(0, $num, 0);
            } elseif (\is_array($length)) {
                $length = \array_slice($length, 0, $num);
                foreach ($length as &$value_tmp_V2) {
                    $value_tmp_V2 = (int) $value_tmp_V2 === $value_tmp_V2 ? $value_tmp_V2 : $num;
                }
                unset($value_tmp_V2);
            } else {
                $length = \array_pad([$length], $num, $length);
            }

            // Recursive call per element. A closure is used (instead of a plain
            // callable) so that the requested $encoding is forwarded as well;
            // otherwise every element would silently be processed as 'UTF-8'.
            return \array_map(
                static function ($str_item, $replacement_item, $offset_item, $length_item) use ($encoding) {
                    return self::substr_replace(
                        $str_item,
                        $replacement_item,
                        $offset_item,
                        $length_item,
                        $encoding
                    );
                },
                $str,
                $replacement,
                $offset,
                $length
            );
        }

        // ---------------------------------------------------------------
        // Scalar input
        // ---------------------------------------------------------------

        // A replacement array for a single string: only its first entry is used.
        if (\is_array($replacement)) {
            if ($replacement !== []) {
                $replacement = $replacement[0];
            } else {
                $replacement = '';
            }
        }

        // init
        $str = (string) $str;
        $replacement = (string) $replacement;

        if (\is_array($length)) {
            throw new \InvalidArgumentException('Parameter "$length" can only be an array, if "$str" is also an array.');
        }

        if (\is_array($offset)) {
            throw new \InvalidArgumentException('Parameter "$offset" can only be an array, if "$str" is also an array.');
        }

        if ($str === '') {
            return $replacement;
        }

        if (self::$SUPPORT['mbstring'] === true) {
            $string_length = (int) self::strlen($str, $encoding);

            // Clamp the offset into [0, string length]; negative offsets count from the end.
            if ($offset < 0) {
                $offset = (int) \max(0, $string_length + $offset);
            } elseif ($offset > $string_length) {
                $offset = $string_length;
            }

            // Negative lengths stop that many characters before the end;
            // a missing or oversized length means "up to the end".
            if ($length !== null && $length < 0) {
                $length = (int) \max(0, $string_length - $offset + $length);
            } elseif ($length === null || $length > $string_length) {
                $length = $string_length;
            }

            if (($offset + $length) > $string_length) {
                $length = $string_length - $offset;
            }

            // head + replacement + tail
            return ((string) \mb_substr($str, 0, $offset, $encoding)) .
                   $replacement .
                   ((string) \mb_substr($str, $offset + $length, $string_length - $offset - $length, $encoding));
        }

        //
        // fallback for ascii only
        //

        if (ASCII::is_ascii($str)) {
            return ($length === null) ?
                \substr_replace($str, $replacement, $offset) :
                \substr_replace($str, $replacement, $offset, $length);
        }

        //
        // fallback via vanilla php
        //

        // Split both strings into arrays of single characters and splice them.
        \preg_match_all('/./us', $str, $str_matches);
        \preg_match_all('/./us', $replacement, $replacement_matches);

        if ($length === null) {
            $length_tmp = self::strlen($str, $encoding);
            if ($length_tmp === false) {
                // e.g.: non mbstring support + invalid chars
                return '';
            }
            $length = $length_tmp;
        }

        \array_splice($str_matches[0], $offset, $length, $replacement_matches[0]);

        return \implode('', $str_matches[0]);
    }

    /**
     * Removes a suffix ($needle) from the end of the string ($haystack).
     *
     * EXAMPLE:


     * UTF8::substr_right('BeginMiddleΚόσμε', 'Κόσμε'); // 'BeginMiddle'
     * UTF8::substr_right('BeginMiddleΚόσμε', 'κόσμε'); // 'BeginMiddleΚόσμε'
     *

* * @param string $haystack

The string to search in.

* @param string $needle

The substring to search for.

* @param string $encoding [optional]

Set the charset for e.g. "mb_" function

* * @psalm-pure * * @return string *

Return the sub-string.

*/
    public static function substr_right(
        string $haystack,
        string $needle,
        string $encoding = 'UTF-8'
    ): string {
        // An empty haystack or an empty needle leaves nothing to strip; in both
        // cases the haystack itself is the answer ('' for an empty haystack).
        if ($haystack === '' || $needle === '') {
            return $haystack;
        }

        // Byte-wise, case-sensitive suffix comparison.
        $has_suffix = \substr($haystack, -\strlen($needle)) === $needle;
        if (!$has_suffix) {
            return $haystack;
        }

        if ($encoding === 'UTF-8') {
            // Fast path: use the native "mb_" functions directly.
            $keep = (int) \mb_strlen($haystack) - (int) \mb_strlen($needle);

            return (string) \mb_substr($haystack, 0, $keep);
        }

        // Other encodings go through the encoding-aware wrappers of this class.
        $keep = (int) self::strlen($haystack, $encoding) - (int) self::strlen($needle, $encoding);

        return (string) self::substr($haystack, 0, $keep, $encoding);
    }

    /**
     * Returns a case swapped version of the string.
     *
     * EXAMPLE: UTF8::swapCase('déJÀ σσς iıII'); // 'DÉjà ΣΣΣ IIii'
     *
     * @param string $str

The input string.

* @param string $encoding [optional]

Set the charset for e.g. "mb_" function

* @param bool $clean_utf8 [optional]

Remove non UTF-8 chars from the string.

* * @psalm-pure * * @return string *

Each character's case swapped.

*/
    public static function swapCase(string $str, string $encoding = 'UTF-8', bool $clean_utf8 = false): string
    {
        if ($str === '') {
            return '';
        }

        if ($clean_utf8) {
            // "mb_strpos()" and "iconv_strpos()" returns wrong position,
            // if invalid characters are found in $haystack before $needle
            $str = self::clean($str);
        }

        // Build the lower- and upper-cased variants of the input ...
        if ($encoding === 'UTF-8') {
            $lower = \mb_strtolower($str);
            $upper = \mb_strtoupper($str);
        } else {
            $lower = self::strtolower($str, $encoding);
            $upper = self::strtoupper($str, $encoding);
        }

        // ... and combine them byte-wise: XOR-ing both variants with the original
        // yields, for every byte, the variant that differs from the input.
        return (string) ($lower ^ $upper ^ $str);
    }

    /**
     * Checks whether symfony-polyfills are used.
     *
     * @psalm-pure
     *
     * @return bool
     *

true if in use, false otherwise

* * @internal

Please do not use it anymore, we will make it private in the next major version.

*/
    public static function symfony_polyfill_used(): bool
    {
        // A polyfill is in use when the extension itself is missing
        // but one of its functions is defined anyway.
        $mbstring_is_polyfilled = !\extension_loaded('mbstring') && \function_exists('mb_strlen');
        $iconv_is_polyfilled = !\extension_loaded('iconv') && \function_exists('iconv');

        return $mbstring_is_polyfilled || $iconv_is_polyfilled;
    }

    /**
     * Replaces every tab character with $tab_length space characters.
     *
     * @param string $str        <p>The input string.</p>
     * @param int    $tab_length <p>Number of spaces that replace one tab.</p>
     *
     * @psalm-pure
     *
     * @return string
     */
    public static function tabs_to_spaces(string $str, int $tab_length = 4): string
    {
        // Always derive the replacement from $tab_length, so that the
        // result matches the requested width for every value.
        return \str_replace("\t", \str_repeat(' ', $tab_length), $str);
    }

    /**
     * Converts the first character of each word in the string to uppercase
     * and all other chars to lowercase.
     *
     * @param string $str

The input string.

* @param string $encoding [optional]

Set the charset for e.g. "mb_" function

* @param bool $clean_utf8 [optional]

Remove non UTF-8 chars from the string.

* @param string|null $lang [optional]

Set the language for special cases: az, el, lt, * tr

* @param bool $try_to_keep_the_string_length [optional]

true === try to keep the string length: e.g. ẞ * -> ß

* * @psalm-pure * * @return string *

A string with all characters of $str being title-cased.

*/
    public static function titlecase(
        string $str,
        string $encoding = 'UTF-8',
        bool $clean_utf8 = false,
        string $lang = null,
        bool $try_to_keep_the_string_length = false
    ): string {
        if ($clean_utf8) {
            // "mb_strpos()" and "iconv_strpos()" returns wrong position,
            // if invalid characters are found in $haystack before $needle
            $str = self::clean($str);
        }

        // Language specific handling or length preservation requested:
        // delegate to the titleize helper of this class.
        if ($lang !== null || $try_to_keep_the_string_length) {
            return self::str_titleize(
                $str,
                null,
                $encoding,
                false,
                $lang,
                $try_to_keep_the_string_length,
                false
            );
        }

        // Plain case: the native "mb_convert_case" is enough.
        if ($encoding === 'UTF-8') {
            return \mb_convert_case($str, \MB_CASE_TITLE);
        }

        $normalized_encoding = self::normalize_encoding($encoding, 'UTF-8');

        return \mb_convert_case($str, \MB_CASE_TITLE, $normalized_encoding);
    }

    /**
     * Convert a string into ASCII.
     *
     * EXAMPLE: UTF8::to_ascii('déjà σσς iıii'); // 'deja sss iiii'
     *
     * @param string $str

The input string.

* @param string $unknown [optional]

Character to use if a character is unknown. (default is ?)

* @param bool $strict [optional]

Use "transliterator_transliterate()" from PHP-Intl | WARNING: bad * performance

* * @psalm-pure * * @return string */
    public static function to_ascii(
        string $str,
        string $unknown = '?',
        bool $strict = false
    ): string {
        // Thin wrapper: the transliteration itself lives in the ASCII class.
        $ascii = ASCII::to_transliterate($str, $unknown, $strict);

        return $ascii;
    }

    /**
     * @param bool|float|int|string $str
     *
     * @psalm-pure
     *
     * @return bool
     */
    public static function to_boolean($str): bool
    {
        $value = (string) $str;

        if ($value === '') {
            return false;
        }

        // Info: http://php.net/manual/en/filter.filters.validate.php
        $known_words = [
            'true'  => true,
            '1'     => true,
            'on'    => true,
            'yes'   => true,
            'false' => false,
            '0'     => false,
            'off'   => false,
            'no'    => false,
        ];

        // Look the value up as given first, then in its lowercased form.
        foreach ([$value, \strtolower($value)] as $candidate) {
            if (isset($known_words[$candidate])) {
                return $known_words[$candidate];
            }
        }

        // Numbers are "true" when they are greater than zero.
        if (\is_numeric($value)) {
            return ((float) $value) > 0;
        }

        // Anything else: truthiness of the trimmed string.
        return (bool) \trim($value);
    }

    /**
     * Convert given string to safe filename (and keep string case).
     *
     * @param string $str
     * @param bool   $use_transliterate No transliteration, conversion etc. is done by default - unsafe characters are
     *                                  simply replaced with hyphen.
     * @param string $fallback_char
     *
     * @psalm-pure
     *
     * @return string
     */
    public static function to_filename(
        string $str,
        bool $use_transliterate = false,
        string $fallback_char = '-'
    ): string {
        // Thin wrapper: the conversion itself lives in the ASCII class.
        $filename = ASCII::to_filename($str, $use_transliterate, $fallback_char);

        return $filename;
    }

    /**
     * Convert a string into "ISO-8859"-encoding (Latin-1).
     *
     * EXAMPLE: UTF8::to_utf8(UTF8::to_iso8859(' -ABC-中文空白- ')); // ' -ABC-????- '
     *
     * @param string|string[] $str
     *
     * @psalm-pure
     *
     * @return string|string[]
     *
     * @template TToIso8859 as string|string[]
     * @phpstan-param TToIso8859 $str
     * @phpstan-return (TToIso8859 is string ? string : string[])
     */
    public static function to_iso8859($str)
    {
        // Arrays are converted element by element (keys are kept).
        if (\is_array($str)) {
            foreach ($str as $key => $item) {
                $str[$key] = self::to_iso8859($item);
            }

            return $str;
        }

        /* @phpstan-ignore-next-line | FP? -> "Cannot cast TToIso8859 of array|string to string." it's a string here */
        $string = (string) $str;

        return $string === '' ? '' : self::utf8_decode($string);
    }

    /**
     * This function leaves UTF-8 characters alone, while converting almost all non-UTF8 to UTF8.
     *
     *

It decodes UTF-8 codepoints and Unicode escape sequences.
It assumes that the encoding of the original string is either WINDOWS-1252 or ISO-8859.
WARNING: It does not remove invalid UTF-8 characters, so you maybe need to use "UTF8::clean()" for this * case.

* * EXAMPLE: UTF8::to_utf8(["\u0063\u0061\u0074"]); // array('cat') * * @param string|string[] $str

Any string or array of strings.

* @param bool $decode_html_entity_to_utf8

Set to true, if you need to decode html-entities.

* * @psalm-pure * * @return string|string[] *

The UTF-8 encoded string

* * @template TToUtf8 as string|string[] * @phpstan-param TToUtf8 $str * @phpstan-return (TToUtf8 is string ? string : string[]) */
    public static function to_utf8($str, bool $decode_html_entity_to_utf8 = false)
    {
        if (!\is_array($str)) {
            \assert(\is_string($str));

            $str = self::to_utf8_string($str, $decode_html_entity_to_utf8);

            /** @phpstan-var TToUtf8 $str */
            return $str;
        }

        // Arrays are converted element by element (keys are kept).
        foreach ($str as $key => $item) {
            $str[$key] = self::to_utf8_string($item, $decode_html_entity_to_utf8);
        }

        /** @phpstan-var TToUtf8 $str */
        return $str;
    }

    /**
     * This function leaves UTF-8 characters alone, while converting almost all non-UTF8 to UTF8.
     *
     *

It decodes UTF-8 codepoints and Unicode escape sequences.
It assumes that the encoding of the original string is either WINDOWS-1252 or ISO-8859.
WARNING: It does not remove invalid UTF-8 characters, so you maybe need to use "UTF8::clean()" for this * case.

* * EXAMPLE: UTF8::to_utf8_string("\u0063\u0061\u0074"); // 'cat' * * @param string $str

Any string.

* @param bool $decode_html_entity_to_utf8

Set to true, if you need to decode html-entities.

* * @psalm-pure * * @return string *

The UTF-8 encoded string

* * @template T as string * @phpstan-param T $str * @phpstan-return (T is non-empty-string ? non-empty-string : string) */
    public static function to_utf8_string(string $str, bool $decode_html_entity_to_utf8 = false): string
    {
        if ($str === '') {
            return $str;
        }

        $byte_count = \strlen($str);
        $out = '';

        for ($pos = 0; $pos < $byte_count; ++$pos) {
            $lead = $str[$pos];

            // Bytes below 0xC0 can never start a multi-byte sequence.
            if ($lead < "\xC0") {
                if (($lead & "\xC0") === "\x80") {
                    // stray continuation byte (10xxxxxx): needs conversion
                    $out .= self::to_utf8_convert_helper($lead);
                } else {
                    // plain 7-bit byte: it doesn't need conversion
                    $out .= $lead;
                }

                continue;
            }

            // How many continuation bytes should follow this lead byte?
            if ($lead <= "\xDF") {
                $tail_length = 1; // looks like 2 bytes UTF8
            } elseif ($lead <= "\xEF") {
                $tail_length = 2; // looks like 3 bytes UTF8
            } elseif ($lead <= "\xF7") {
                $tail_length = 3; // looks like 4 bytes UTF8
            } else {
                $tail_length = 0; // doesn't look like UTF8, but should be converted
            }

            // Collect the continuation bytes; a missing byte is treated as "\x00",
            // which (like every byte outside 0x80-0xBF) invalidates the sequence.
            $tail = '';
            for ($k = 1; $k <= $tail_length; ++$k) {
                $next = $pos + $k >= $byte_count ? "\x00" : $str[$pos + $k];
                if ($next < "\x80" || $next > "\xBF") {
                    $tail = null;

                    break;
                }
                $tail .= $next;
            }

            if ($tail_length === 0 || $tail === null) {
                // not valid UTF8 - convert the lead byte only
                $out .= self::to_utf8_convert_helper($lead);

                continue;
            }

            // yeah, almost sure it's UTF8 already
            $out .= $lead . $tail;
            $pos += $tail_length;
        }

        // decode unicode escape sequences + unicode surrogate pairs
        $out = \preg_replace_callback(
            '/\\\\u([dD][89abAB][0-9a-fA-F]{2})\\\\u([dD][cdefCDEF][\da-fA-F]{2})|\\\\u([0-9a-fA-F]{4})/',
            /**
             * @param array $matches
             *
             * @psalm-pure
             *
             * @return string
             */
            static function (array $matches): string {
                if (isset($matches[3])) {
                    // single escape sequence
                    $code_point = (int) \hexdec($matches[3]);
                } else {
                    // surrogate pair, see http://unicode.org/faq/utf_bom.html#utf16-4
                    $code_point = ((int) \hexdec($matches[1]) << 10)
                                  + (int) \hexdec($matches[2])
                                  + 0x10000
                                  - (0xD800 << 10)
                                  - 0xDC00;
                }

                // https://github.com/php/php-src/blob/php-7.3.2/ext/standard/html.c#L471
                //
                // php_utf32_utf8(unsigned char *buf, unsigned k)

                if ($code_point < 0x80) {
                    return (string) self::chr($code_point);
                }

                if ($code_point < 0xA0) {
                    /** @noinspection UnnecessaryCastingInspection */
                    return (string) self::chr(0xC0 | ($code_point >> 6))
                           . (string) self::chr(0x80 | ($code_point & 0x3F));
                }

                return self::decimal_to_chr($code_point);
            },
            $out
        );

        // preg_replace_callback() signals an error with null
        if ($out === null) {
            return '';
        }

        // decode UTF-8 codepoints
        if ($decode_html_entity_to_utf8) {
            $out = self::html_entity_decode($out);
        }

        return $out;
    }

    /**
     * Returns the given string as an integer, or null if the string isn't numeric.
     *
     * @param string $str
     *
     * @psalm-pure
     *
     * @return int|null
     *

null if the string isn't numeric

*/
    public static function to_int(string $str)
    {
        // Only numeric strings are converted; everything else yields null.
        return \is_numeric($str) ? (int) $str : null;
    }

    /**
     * Returns the given input as string, or null if the input isn't int|float|string
     * and do not implement the "__toString()" method.
     *
     * @param float|int|object|string|null $input
     *
     * @psalm-pure
     *
     * @return string|null
     *

null if the input isn't int|float|string and has no "__toString()" method

*/
    public static function to_string($input)
    {
        if ($input === null) {
            return null;
        }

        $type = \gettype($input);

        // Scalars that have a natural string representation.
        $is_castable_scalar = \in_array($type, ['string', 'integer', 'float', 'double'], true);
        if ($is_castable_scalar) {
            return (string) $input;
        }

        // Objects are only accepted when they can be cast to string.
        /** @phpstan-ignore-next-line - "gettype": FP? */
        if ($type === 'object' && \method_exists($input, '__toString')) {
            return (string) $input;
        }

        return null;
    }

    /**
     * Strip whitespace or other characters from the beginning and end of a UTF-8 string.
     *
     * INFO: This is slower then "trim()"
     *
     * We can only use the original-function, if we use <= 7-Bit in the string / chars
     * but the check for ASCII (7-Bit) cost more time, then we can safe here.
     *
     * EXAMPLE: UTF8::trim(' -ABC-中文空白- '); // '-ABC-中文空白-'
     *
     * @param string $str

The string to be trimmed

* @param string|null $chars [optional]

Optional characters to be stripped

* * @psalm-pure * * @return string *

The trimmed string.

*/
    public static function trim(string $str = '', string $chars = null): string
    {
        if ($str === '') {
            return '';
        }

        $use_mbstring = self::$SUPPORT['mbstring'] === true;

        // Content of the character class that is stripped from both ends:
        // whitespace by default, otherwise the quoted custom characters.
        if ($chars === null) {
            $char_class = '\\s';
        } elseif ($use_mbstring) {
            /** @noinspection PregQuoteUsageInspection */
            $char_class = \preg_quote($chars);
        } else {
            $char_class = \preg_quote($chars, '/');
        }

        $pattern = "^[{$char_class}]+|[{$char_class}]+\$";

        if ($use_mbstring) {
            return (string) \mb_ereg_replace($pattern, '', $str);
        }

        return self::regex_replace($str, $pattern, '');
    }

    /**
     * Makes string's first char uppercase.
     *
     * EXAMPLE: UTF8::ucfirst('ñtërnâtiônàlizætiøn foo'); // 'Ñtërnâtiônàlizætiøn foo'
     *
     * @param string $str

The input string.

* @param string $encoding [optional]

Set the charset for e.g. "mb_" function

* @param bool $clean_utf8 [optional]

Remove non UTF-8 chars from the string.

* @param string|null $lang [optional]

Set the language for special cases: az, el, lt, * tr

* @param bool $try_to_keep_the_string_length [optional]

true === try to keep the string length: e.g. ẞ * -> ß

* * @psalm-pure * * @return string *

The resulting string with its first char uppercased.

*/
    public static function ucfirst(
        string $str,
        string $encoding = 'UTF-8',
        bool $clean_utf8 = false,
        string $lang = null,
        bool $try_to_keep_the_string_length = false
    ): string {
        if ($str === '') {
            return '';
        }

        if ($clean_utf8) {
            // "mb_strpos()" and "iconv_strpos()" returns wrong position,
            // if invalid characters are found in $haystack before $needle
            $str = self::clean($str);
        }

        // The native "mb_" functions are sufficient as long as no language
        // specific handling and no length preservation is requested.
        $native_is_enough = $lang === null && !$try_to_keep_the_string_length;

        $is_utf8 = $encoding === 'UTF-8';
        if (!$is_utf8) {
            $encoding = self::normalize_encoding($encoding, 'UTF-8');
        }

        // Everything after the first character stays untouched.
        if ($is_utf8) {
            $rest = (string) \mb_substr($str, 1);
        } else {
            $rest = (string) self::substr($str, 1, null, $encoding);
        }

        // Uppercase the first character.
        if ($native_is_enough) {
            if ($is_utf8) {
                $first = \mb_strtoupper((string) \mb_substr($str, 0, 1));
            } else {
                $first = \mb_strtoupper((string) \mb_substr($str, 0, 1, $encoding), $encoding);
            }
        } else {
            if ($is_utf8) {
                $first_char = (string) \mb_substr($str, 0, 1);
            } else {
                $first_char = (string) self::substr($str, 0, 1, $encoding);
            }

            $first = self::strtoupper(
                $first_char,
                $encoding,
                false,
                $lang,
                $try_to_keep_the_string_length
            );
        }

        return $first . $rest;
    }

    /**
     * Uppercase for all words in the string.
     *
     * EXAMPLE: UTF8::ucwords('iñt ërn âTi ônà liz æti øn'); // 'Iñt Ërn ÂTi Ônà Liz Æti Øn'
     *
     * @param string $str

The input string.

* @param string[] $exceptions [optional]

Exclusion for some words.

* @param string $char_list [optional]

Additional chars that belong to words and do not start a new * word.

* @param string $encoding [optional]

Set the charset.

* @param bool $clean_utf8 [optional]

Remove non UTF-8 chars from the string.

* * @psalm-pure * * @return string */
    public static function ucwords(
        string $str,
        array $exceptions = [],
        string $char_list = '',
        string $encoding = 'UTF-8',
        bool $clean_utf8 = false
    ): string {
        // Compare against '' explicitly: a loose check would also treat
        // the valid input '0' as empty and return '' for it.
        if ($str === '') {
            return '';
        }

        // INFO: mb_convert_case($str, MB_CASE_TITLE);
        // -> MB_CASE_TITLE didn't only uppercase the first letter, it also lowercase all other letters

        if ($clean_utf8) {
            // "mb_strpos()" and "iconv_strpos()" returns wrong position,
            // if invalid characters are found in $haystack before $needle
            $str = self::clean($str);
        }

        // The native ucwords() is only usable when neither extra word chars nor
        // (non-empty) exceptions are given. Again an explicit '' comparison, so
        // that a char list / exception of '0' is not silently ignored.
        $use_php_default_functions = ($char_list . \implode('', $exceptions)) === '';

        if (
            $use_php_default_functions
            &&
            ASCII::is_ascii($str)
        ) {
            return \ucwords($str);
        }

        $words = self::str_to_words($str, $char_list);
        $use_exceptions = $exceptions !== [];

        $words_str = '';
        foreach ($words as $word) {
            // Skip empty parts only; a part like '0' must be kept.
            if ($word === '') {
                continue;
            }

            if (
                !$use_exceptions
                ||
                !\in_array($word, $exceptions, true)
            ) {
                $words_str .= self::ucfirst($word, $encoding);
            } else {
                // excluded words are appended unchanged
                $words_str .= $word;
            }
        }

        return $words_str;
    }

    /**
     * Multi decode HTML entity + fix urlencoded-win1252-chars.
     *
     * EXAMPLE: UTF8::urldecode('tes%20öäü%20\u00edtest+test'); // 'tes öäü ítest test'
     *
     * e.g:
     * 'test+test' => 'test test'
     * 'Düsseldorf' => 'Düsseldorf'
     * 'D%FCsseldorf' => 'Düsseldorf'
     * 'Düsseldorf' => 'Düsseldorf'
     * 'D%26%23xFC%3Bsseldorf' => 'Düsseldorf'
     * 'DÃ¼sseldorf' => 'Düsseldorf'
     * 'D%C3%BCsseldorf' => 'Düsseldorf'
     * 'D%C3%83%C2%BCsseldorf' => 'Düsseldorf'
     * 'D%25C3%2583%25C2%25BCsseldorf' => 'Düsseldorf'
     *
     * @param string $str

The input string.

* @param bool $multi_decode

Decode as often as possible.

* * @psalm-pure * * @return string * * @template T as string * @phpstan-param T $str * @phpstan-return (T is non-empty-string ? non-empty-string : string) */
    public static function urldecode(string $str, bool $multi_decode = true): string
    {
        if ($str === '') {
            return '';
        }

        $decoded = self::urldecode_unicode_helper($str);

        // One decoding pass is always done; with $multi_decode the pass is
        // repeated until the string does not change anymore.
        do {
            $before_pass = $decoded;

            /**
             * @psalm-suppress PossiblyInvalidArgument
             */
            $decoded = \urldecode(
                self::html_entity_decode(
                    self::to_utf8($decoded),
                    \ENT_QUOTES | \ENT_HTML5
                )
            );
        } while ($multi_decode && $before_pass !== $decoded);

        return self::fix_simple_utf8($decoded);
    }

    /**
     * Decodes a UTF-8 string to ISO-8859-1.
     *
     * EXAMPLE: UTF8::encode('UTF-8', UTF8::utf8_decode('-ABC-中文空白-')); // '-ABC-????-'
     *
     * @param string $str

The input string.

* @param bool $keep_utf8_chars * * @psalm-pure * * @return string */
    public static function utf8_decode(string $str, bool $keep_utf8_chars = false): string
    {
        if ($str === '') {
            return '';
        }

        // save for later comparision
        $original = $str;
        $byte_count = \strlen($str);

        // lazy-load the lookup tables
        if (self::$ORD === null) {
            self::$ORD = self::getData('ord');
        }

        if (self::$CHR === null) {
            self::$CHR = self::getData('chr');
        }

        $no_char_found = '?';

        // The string is rewritten in place: $read walks over the input bytes,
        // $write marks the position of the next output byte ($write <= $read).
        for ($read = 0, $write = 0; $read < $byte_count; ++$read, ++$write) {
            $high_nibble = $str[$read] & "\xF0";

            if ($high_nibble === "\xC0" || $high_nibble === "\xD0") {
                // 2-byte sequence: combine both bytes into one code point
                $upper_bits = self::$ORD[$str[$read] & "\x1F"] << 6;
                ++$read;
                $code = $upper_bits | self::$ORD[$str[$read] & "\x3F"];

                $str[$write] = $code < 256 ? self::$CHR[$code] : $no_char_found;
            } elseif ($high_nibble === "\xF0" || $high_nibble === "\xE0") {
                // 4-byte / 3-byte sequence: replaced by the placeholder,
                // the remaining bytes of the sequence are skipped
                $str[$write] = $no_char_found;
                $read += $high_nibble === "\xF0" ? 3 : 2;
            } else {
                // single byte: copied as it is
                $str[$write] = $str[$read];
            }
        }

        /** @var false|string $return - needed for PhpStan (stubs error) */
        $return = \substr($str, 0, $write);
        if ($return === false) {
            $return = '';
        }

        // Optionally hand back the untouched input when decoding did not shorten it.
        if (
            $keep_utf8_chars
            &&
            (int) self::strlen($return) >= (int) self::strlen($original)
        ) {
            return $original;
        }

        return $return;
    }

    /**
     * Encodes an ISO-8859-1 string to UTF-8.
     *
     * EXAMPLE: UTF8::utf8_decode(UTF8::utf8_encode('-ABC-中文空白-')); // '-ABC-中文空白-'
     *
     * @param string $str

The input string.

* * @psalm-pure * * @return string */
    public static function utf8_encode(string $str): string
    {
        if ($str === '') {
            return '';
        }

        /** @noinspection PhpUsageOfSilenceOperatorInspection | TODO for PHP > 8.2: find a replacement for this */
        /** @var false|string $encoded - the polyfill maybe return false */
        $encoded = @\utf8_encode($str);

        // a failed conversion is reported as an empty string
        return $encoded === false ? '' : $encoded;
    }

    /**
     * Returns an array with all utf8 whitespace characters.
     *
     * @see http://www.bogofilter.org/pipermail/bogofilter/2003-March/001889.html
     *
     * @psalm-pure
     *
     * @return string[]
     *                  An array with all known whitespace characters as values and the type of whitespace as keys
     *                  as defined in above URL
     */
    public static function whitespace_table(): array
    {
        // plain accessor for the static lookup table
        $table = self::$WHITESPACE_TABLE;

        return $table;
    }

    /**
     * Limit the number of words in a string.
     *
     * EXAMPLE: UTF8::words_limit('fòô bàř fòô', 2, ''); // 'fòô bàř'
     *
     * @param string $str

The input string.

* @param int<1, max> $limit

The limit of words as integer.

* @param string $str_add_on

Replacement for the stripped string.

* * @psalm-pure * * @return string */
    public static function words_limit(
        string $str,
        int $limit = 100,
        string $str_add_on = '…'
    ): string {
        if ($str === '') {
            return '';
        }

        /* @phpstan-ignore-next-line | we do not trust the phpdoc check */
        if ($limit <= 0) {
            return '';
        }

        // Match the leading whitespace plus up to $limit words (each with its trailing whitespace).
        \preg_match('/^\\s*+(?:[^\\s]++\\s*+){1,' . $limit . '}/u', $str, $matches);

        // No match at all: nothing to limit.
        if (!isset($matches[0])) {
            return $str;
        }

        // The match covers the whole string: the limit was not exceeded.
        $limited = $matches[0];
        if (\mb_strlen($str) === (int) \mb_strlen($limited)) {
            return $str;
        }

        return \rtrim($limited) . $str_add_on;
    }

    /**
     * Wraps a string to a given number of characters
     *
     * EXAMPLE:

UTF8::wordwrap('Iñtërnâtiônàlizætiøn', 2, '
', true)); // 'Iñ
të
rn
ât
iô
nà
li
zæ
ti
øn'

* * @see http://php.net/manual/en/function.wordwrap.php * * @param string $str

The input string.

* @param int<1, max> $width [optional]

The column width.

* @param string $break [optional]

The line is broken using the optional break parameter.

* @param bool $cut [optional]

* If the cut is set to true, the string is * always wrapped at or before the specified width. So if you have * a word that is larger than the given width, it is broken apart. *

* * @psalm-pure * * @return string *

The given string wrapped at the specified column.

*/
    public static function wordwrap(
        string $str,
        int $width = 75,
        string $break = "\n",
        bool $cut = false
    ): string {
        if ($str === '' || $break === '') {
            return '';
        }

        // Build two parallel structures:
        // - $chars: the real characters (existing breaks are kept as one element)
        // - $mask:  a single-byte stand-in per element ('#' = break, ' ' = space, '?' = other)
        // so that the native, byte based wordwrap() can be applied to $mask.
        /** @var string[] $chars */
        $chars = [];
        $mask = '';
        foreach (\explode($break, $str) as $segment_index => $segment) {
            if ($segment_index) {
                $chars[] = $break;
                $mask .= '#';
            }

            foreach (self::str_split($segment) as $char) {
                $chars[] = $char;
                $mask .= $char === ' ' ? ' ' : '?';
            }
        }

        $result = '';
        $char_index = 0;
        $break_pos = -1;
        $mask_pos = -1;
        $mask = \wordwrap($mask, $width, '#', $cut);
        $mask_length = \mb_strlen($mask);

        // Walk over all break markers of the wrapped mask and copy the
        // corresponding real characters in front of each of them.
        /** @noinspection PhpAssignmentInConditionInspection - is ok here */
        while (($break_pos = \mb_strpos($mask, '#', $break_pos + 1)) !== false) {
            for (++$mask_pos; $mask_pos < $break_pos; ++$mask_pos) {
                if (isset($chars[$char_index])) {
                    $result .= $chars[$char_index];
                    unset($chars[$char_index]);
                }

                ++$char_index;

                // prevent endless loop, e.g. if there is a error in the "mb_*" polyfill
                if ($mask_pos > $mask_length) {
                    break 2;
                }
            }

            // The character at the break position is consumed when it is
            // an original break or a space that got replaced by the break.
            if (
                $break === $chars[$char_index]
                ||
                $chars[$char_index] === ' '
            ) {
                unset($chars[$char_index++]);
            }

            $result .= $break;

            // prevent endless loop, e.g. if there is a error in the "mb_*" polyfill
            if ($break_pos > $mask_length) {
                break;
            }
        }

        // whatever was not consumed above forms the last line
        return $result . \implode('', $chars);
    }

    /**
     * Line-Wrap the string after $limit, but split the string by "$delimiter" before ...
     * ... so that we wrap the per line.
     *
     * @param string $str

The input string.

* @param int<1, max> $width [optional]

The column width.

* @param string $break [optional]

The line is broken using the optional break parameter.

* @param bool $cut [optional]

* If the cut is set to true, the string is * always wrapped at or before the specified width. So if you have * a word that is larger than the given width, it is broken apart. *

* @param bool $add_final_break [optional]

* If this flag is true, then the method will add a $break at the end * of the result string. *

* @param non-empty-string|null $delimiter [optional]

* You can change the default behavior, where we split the string by newline. *

* * @psalm-pure * * @return string */
    public static function wordwrap_per_line(
        string $str,
        int $width = 75,
        string $break = "\n",
        bool $cut = false,
        bool $add_final_break = true,
        string $delimiter = null
    ): string {
        // Split into lines: by any kind of newline, or by the custom delimiter.
        $lines = $delimiter === null
            ? \preg_split('/\\r\\n|\\r|\\n/', $str)
            : \explode($delimiter, $str);

        // Wrap every line on its own (a failed split yields no lines).
        $wrapped_lines = [];
        if ($lines !== false) {
            foreach ($lines as $line) {
                $wrapped_lines[] = self::wordwrap($line, $width, $break, $cut);
            }
        }

        $glue = $delimiter ?? "\n";
        $final_break = $add_final_break ? $break : '';

        return \implode($glue, $wrapped_lines) . $final_break;
    }

    /**
     * Returns an array of Unicode White Space characters.
     *
     * @psalm-pure
     *
     * @return string[]
     *

An array with numeric code point as key and White Space Character as value.

*/
    public static function ws(): array
    {
        // plain accessor for the static white space list
        $white_space = self::$WHITESPACE;

        return $white_space;
    }

    /**
     * Checks whether the passed string contains only byte sequences that are valid UTF-8 characters.
     *
     * EXAMPLE:


     * UTF8::is_utf8_string('Iñtërnâtiônàlizætiøn'); // true
     * //
     * UTF8::is_utf8_string("Iñtërnâtiônàlizætiøn\xA0\xA1"); // false
     *

* * @see http://hsivonen.iki.fi/php-utf8/ * * @param string $str

The string to be checked.

* @param bool $strict

Check also if the string is not UTF-16 or UTF-32.

* * @psalm-pure * * @return bool */
    private static function is_utf8_string(string $str, bool $strict = false)
    {
        if ($str === '') {
            return true;
        }

        // Strict mode: binary looking input that is detected as UTF-16 or UTF-32 is rejected.
        if ($strict && self::is_binary($str, true)) {
            if (
                self::is_utf16($str, false) !== false
                ||
                self::is_utf32($str, false) !== false
            ) {
                return false;
            }
        }

        if (self::$SUPPORT['pcre_utf8']) {
            // If even just the first character can be matched, when the /u
            // modifier is used, then it's valid UTF-8. If the UTF-8 is somehow
            // invalid, nothing at all will match, even if the string contains
            // some valid sequences
            return \preg_match('/^./us', $str) === 1;
        }

        // Lead octet table: [bit mask, expected bits, payload mask, number of continuation octets].
        //
        // The 5 and 6 octet forms are illegal (the encoded code point is either not
        // the shortest form or outside the Unicode range of 0-0x10FFFF). Rather than
        // trying to resynchronize, the sequence is read until its end and rejected
        // by the checks that run once the sequence is complete.
        $lead_octets = [
            [0xE0, 0xC0, 0x1F, 1],
            [0xF0, 0xE0, 0x0F, 2],
            [0xF8, 0xF0, 0x07, 3],
            [0xFC, 0xF8, 0x03, 4],
            [0xFE, 0xFC, 0x01, 5],
        ];

        $pending = 0;         // expected number of continuation octets until the current sequence is complete
        $code_point = 0;      // Unicode code point collected so far
        $sequence_length = 1; // expected number of octets in the current sequence

        if (self::$ORD === null) {
            self::$ORD = self::getData('ord');
        }

        $byte_count = \strlen($str);
        for ($pos = 0; $pos < $byte_count; ++$pos) {
            $octet = self::$ORD[$str[$pos]];

            if ($pending === 0) {
                // Either a US-ASCII character or the start of a multi-octet sequence is expected.
                if ((0x80 & $octet) === 0) {
                    // US-ASCII, pass straight through.
                    $sequence_length = 1;

                    continue;
                }

                $is_lead_octet = false;
                foreach ($lead_octets as $lead) {
                    if (($lead[0] & $octet) === $lead[1]) {
                        $pending = $lead[3];
                        $sequence_length = $pending + 1;
                        $code_point = ($octet & $lead[2]) << (6 * $pending);
                        $is_lead_octet = true;

                        break;
                    }
                }

                if (!$is_lead_octet) {
                    // Current octet is neither in the US-ASCII range nor a legal first
                    // octet of a multi-octet sequence.
                    return false;
                }

                continue;
            }

            // Inside a multi-octet sequence only continuation octets are legal.
            if ((0xC0 & $octet) !== 0x80) {
                // Incomplete multi-octet sequence.
                return false;
            }

            // Legal continuation: merge its payload into the code point.
            $code_point |= ($octet & 0x0000003F) << (($pending - 1) * 6);

            if (--$pending !== 0) {
                continue;
            }

            // End of the multi-octet sequence: $code_point now holds the final
            // Unicode code point. Check for illegal sequences and code points.
            if (
                // From Unicode 3.1, non-shortest form is illegal
                ($sequence_length === 2 && $code_point < 0x0080)
                ||
                ($sequence_length === 3 && $code_point < 0x0800)
                ||
                ($sequence_length === 4 && $code_point < 0x10000)
                ||
                ($sequence_length > 4)
                ||
                // From Unicode 3.2, surrogate characters are illegal.
                (($code_point & 0xFFFFF800) === 0xD800)
                ||
                // Code points outside the Unicode range are illegal.
                ($code_point > 0x10FFFF)
            ) {
                return false;
            }

            // reset for the next character ($pending is already 0 here)
            $code_point = 0;
            $sequence_length = 1;
        }

        // a sequence that is still open at the end of the string is invalid
        return $pending === 0;
    }

    /**
     * @param string $str
     * @param bool $use_lowercase

Set to true to convert to lowercase; by default the string is converted to uppercase.

* @param bool $use_full_case_fold

Also apply the full case-folding table, not only the common case mappings.

*
     * @psalm-pure
     *
     * @return string
     */
    private static function fixStrCaseHelper(
        string $str,
        bool $use_lowercase = false,
        bool $use_full_case_fold = false
    ) {
        $upper_chars = self::$COMMON_CASE_FOLD['upper'];
        $lower_chars = self::$COMMON_CASE_FOLD['lower'];

        // Common mappings first; the flag only decides the direction of the replacement.
        $str = $use_lowercase
            ? \str_replace($upper_chars, $lower_chars, $str)
            : \str_replace($lower_chars, $upper_chars, $str);

        if (!$use_full_case_fold) {
            return $str;
        }

        /**
         * Full case-folding table, loaded on first use and kept for later calls.
         *
         * @psalm-suppress ImpureStaticVariable
         *
         * @var array|null
         */
        static $FULL_CASE_FOLD = null;
        if ($FULL_CASE_FOLD === null) {
            $FULL_CASE_FOLD = self::getData('caseFolding_full');
        }

        // Index 0 is replaced by index 1 when lowercasing, and the other way round otherwise.
        if ($use_lowercase) {
            return \str_replace($FULL_CASE_FOLD[0], $FULL_CASE_FOLD[1], $str);
        }

        return \str_replace($FULL_CASE_FOLD[1], $FULL_CASE_FOLD[0], $str);
    }

    /**
     * Load a data table from "/data/*.php" (relative to the directory of this file).
     *
     * @param string $file name of the data file, without the ".php" extension
     *
     * @psalm-pure
     *
     * @return array
     */
    private static function getData(string $file)
    {
        $data_file = __DIR__ . '/data/' . $file . '.php';

        /** @noinspection PhpIncludeInspection */
        /** @noinspection UsingInclusionReturnValueInspection */
        /** @psalm-suppress UnresolvableInclude */
        return include $data_file;
    }

    /**
     * Fill the emoji lookup caches on first use.
     *
     * The emoji table is sorted so that longer keys (by byte length) come first,
     * then the key cache, the value cache and one reversible placeholder per key
     * are derived from it.
     *
     * @psalm-pure
     *
     * @return true|null true when the caches were built by this call, null when they already existed
     */
    private static function initEmojiData()
    {
        // Already initialized, nothing to do.
        if (self::$EMOJI_KEYS_CACHE !== null) {
            return null;
        }

        if (self::$EMOJI === null) {
            self::$EMOJI = self::getData('emoji');
        }

        /**
         * @psalm-suppress ImpureFunctionCall - static sort function is used
         */
        \uksort(
            self::$EMOJI,
            static function (string $a, string $b): int {
                // descending by byte length
                return \strlen($b) <=> \strlen($a);
            }
        );

        self::$EMOJI_KEYS_CACHE = \array_keys(self::$EMOJI);
        self::$EMOJI_VALUES_CACHE = self::$EMOJI;

        foreach (self::$EMOJI_KEYS_CACHE as $emoji_key) {
            // Placeholder built from the CRC32 checksum of the key and the reversed checksum.
            $checksum = \crc32($emoji_key);
            self::$EMOJI_KEYS_REVERSIBLE_CACHE[] = '_-_PORTABLE_UTF8_-_' . $checksum . '_-_' . \strrev((string) $checksum) . '_-_8FTU_ELBATROP_-_';
        }

        return true;
    }

    /**
     * Checks whether mbstring "overloaded" is active on the server.
*
     * @psalm-pure
     *
     * @return bool
     */
    private static function mbstring_overloaded(): bool
    {
        /**
         * INI directive 'mbstring.func_overload' is deprecated since PHP 7.2
         */

        /** @noinspection PhpComposerExtensionStubsInspection */
        /** @noinspection PhpUsageOfSilenceOperatorInspection */
        /** @noinspection DeprecatedIniOptionsInspection */
        return \defined('MB_OVERLOAD_STRING')
               &&
               ((int) @\ini_get('mbstring.func_overload') & \MB_OVERLOAD_STRING);
    }

    /**
     * Copy the given strings into a new list, optionally dropping short and/or blank entries.
     *
     * NOTE(review): the implicit nullable default of $remove_short_values is deprecated as of
     * PHP 8.4; switching to "?int" needs PHP >= 7.1 - confirm the minimum supported version.
     *
     * @param string[] $strings
     * @param bool     $remove_empty_values <p>Skip entries that are empty after trim().</p>
     * @param int|null $remove_short_values <p>Skip entries whose mb_strlen() is less than or equal
     *                                      to this value; null disables the check.</p>
     *
     * @psalm-pure
     *
     * @return list<string>
     */
    private static function reduce_string_array(
        array $strings,
        bool $remove_empty_values,
        int $remove_short_values = null
    ) {
        // init
        $return = [];

        // The entries are only read, so they are iterated by value (no dangling reference).
        foreach ($strings as $str) {
            if (
                $remove_short_values !== null
                &&
                \mb_strlen($str) <= $remove_short_values
            ) {
                continue;
            }

            if (
                $remove_empty_values
                &&
                \trim($str) === ''
            ) {
                continue;
            }

            $return[] = $str;
        }

        return $return;
    }

    /**
     * Build a regular-expression fragment that matches any of the characters of $s
     * (in addition to the raw character-class content given in $class).
     *
     * Characters are collected into one bracket expression; a "-" is moved to the front
     * of it, and elements that self::strlen() does not report as a single character
     * become alternatives of a non-capturing group. Results are cached per ($s, $class).
     *
     * @param string $s     <p>The characters to match.</p>
     * @param string $class <p>Raw content that is placed inside the bracket expression as-is.</p>
     *
     * @return string
     *
     * @psalm-pure
     */
    private static function rxClass(string $s, string $class = '')
    {
        /**
         * @psalm-suppress ImpureStaticVariable
         *
         * @var array<string,string>
         */
        static $RX_CLASS_CACHE = [];

        $cache_key = $s . '_' . $class;

        if (isset($RX_CLASS_CACHE[$cache_key])) {
            return $RX_CLASS_CACHE[$cache_key];
        }

        // Index 0 collects the bracket expression, further entries are alternatives.
        $class_array = [$class];

        // Iterate by value into a dedicated variable instead of re-binding the parameter.
        foreach (self::str_split($s) as $char) {
            if ($char === '-') {
                // A leading "-" is always a literal inside a bracket expression.
                $class_array[0] = '-' . $class_array[0];
            } elseif (!isset($char[2])) {
                // At most two bytes: may be a regex meta character, so quote it.
                $class_array[0] .= \preg_quote($char, '/');
            } elseif (self::strlen($char) === 1) {
                $class_array[0] .= $char;
            } else {
                $class_array[] = $char;
            }
        }

        // NOTE(review): this is a truthiness check, so a class consisting solely of "0" stays unbracketed.
        if ($class_array[0]) {
            $class_array[0] = '[' . $class_array[0] . ']';
        }

        if (\count($class_array) === 1) {
            $return = $class_array[0];
        } else {
            $return = '(?:' . \implode('|', $class_array) . ')';
        }

        $RX_CLASS_CACHE[$cache_key] = $return;

        return $return;
    }

    /**
     * Personal names such as "Marcus Aurelius" are sometimes typed incorrectly using lowercase ("marcus aurelius").
     *
     * Splits $names by $delimiter and upper-cases the first character of every part,
     * except for parts that are a known particle (e.g. "von", "de") or that start with
     * a known prefix (e.g. "mc", "d'"). With "-" as delimiter, parts that merely start
     * with a known particle are left untouched as well.
     *
     * @param string $names
     * @param string $delimiter
     * @param string $encoding
     *
     * @phpstan-param non-empty-string $delimiter
     *
     * @psalm-pure
     *
     * @return string the re-joined names, or an empty string when $delimiter is empty
     */
    private static function str_capitalize_name_helper(
        string $names,
        string $delimiter,
        string $encoding = 'UTF-8'
    ) {
        // explode() cannot split by an empty separator: it returned false (with a warning)
        // before PHP 8.0 and throws a ValueError since. Both cases ended up as '' here,
        // so the situation is handled up front instead of via the warning / exception.
        if ($delimiter === '') {
            return '';
        }

        $name_helper_array = \explode($delimiter, $names);

        $special_cases = [
            'names' => [
                'ab',
                'af',
                'al',
                'and',
                'ap',
                'bint',
                'binte',
                'da',
                'de',
                'del',
                'den',
                'der',
                'di',
                'dit',
                'ibn',
                'la',
                'mac',
                'nic',
                'of',
                'ter',
                'the',
                'und',
                'van',
                'von',
                'y',
                'zu',
            ],
            'prefixes' => [
                'al-',
                "d'",
                'ff',
                "l'",
                'mac',
                'mc',
                'nic',
            ],
        ];

        // Byte-wise "starts with one of these" check.
        $starts_with_any = static function (string $value, array $beginnings): bool {
            foreach ($beginnings as $beginning) {
                if (\strncmp($value, $beginning, \strlen($beginning)) === 0) {
                    return true;
                }
            }

            return false;
        };

        foreach ($name_helper_array as &$name) {
            // Exact particles stay as they are.
            if (\in_array($name, $special_cases['names'], true)) {
                continue;
            }

            $keep_as_is = (
                $delimiter === '-'
                &&
                $starts_with_any($name, $special_cases['names'])
            ) || $starts_with_any($name, $special_cases['prefixes']);

            if ($keep_as_is) {
                continue;
            }

            $name = self::ucfirst($name, $encoding);
        }
        // Break the reference so the last element cannot be modified by accident later on.
        unset($name);

        return \implode($delimiter, $name_helper_array);
    }

    /**
     * Generic case-sensitive transformation for collation matching.
     *
     * @param string $str

The input string

*
     * @psalm-pure
     *
     * @return string|null
     */
    private static function strtonatfold(string $str)
    {
        // Canonical decomposition (NFD) splits base characters from their combining marks.
        $str = \Normalizer::normalize($str, \Normalizer::NFD);
        if ($str === false) {
            return '';
        }

        // Remove the nonspacing marks (\p{Mn}); preg_replace() yields null on a PCRE error.
        return \preg_replace(
            '/\p{Mn}+/u',
            '',
            $str
        );
    }

    /**
     * Convert one input byte into a UTF-8 string.
     *
     * Bytes listed in the Windows-1252 special-case table are mapped through that table;
     * every other byte is encoded as a two-byte sequence (110xxxxx 10xxxxxx).
     *
     * @param int|string $input <p>Key into the "ord" lookup table (presumably a single byte - verify against callers).</p>
     *
     * @psalm-pure
     *
     * @return string
     */
    private static function to_utf8_convert_helper($input)
    {
        // init
        $buf = '';

        // Lazy-load the lookup tables.
        if (self::$ORD === null) {
            self::$ORD = self::getData('ord');
        }

        if (self::$CHR === null) {
            self::$CHR = self::getData('chr');
        }

        if (self::$WIN1252_TO_UTF8 === null) {
            self::$WIN1252_TO_UTF8 = self::getData('win1252_to_utf8');
        }

        $ordC1 = self::$ORD[$input];
        if (isset(self::$WIN1252_TO_UTF8[$ordC1])) { // found in Windows-1252 special cases
            $buf .= self::$WIN1252_TO_UTF8[$ordC1];
        } else {
            // The quotient is usually fractional; a float array key is truncated implicitly,
            // which is deprecated since PHP 8.1, so the truncation is made explicit here.
            $cc1 = self::$CHR[(int) ($ordC1 / 64)] | "\xC0";
            $cc2 = ((string) $input & "\x3F") | "\x80";
            $buf .= $cc1 . $cc2;
        }

        return $buf;
    }

    /**
     * Rewrite non-standard "%uXXXX" URL escapes as hexadecimal HTML entities ("&#xXXXX;").
     *
     * Only escapes with three or four hex digits are rewritten; the entities themselves
     * are not decoded here.
     *
     * @param string $str
     *
     * @psalm-pure
     *
     * @return string
     */
    private static function urldecode_unicode_helper(string $str)
    {
        // Cheap pre-check: without "%u" there is nothing to rewrite.
        if (\strpos($str, '%u') === false) {
            return $str;
        }

        $pattern = '/%u([0-9a-fA-F]{3,4})/';
        if (\preg_match($pattern, $str)) {
            $str = (string) \preg_replace($pattern, '&#x\\1;', $str);
        }

        return $str;
    }
}