[ Index ] |
PHP Cross Reference of WordPress |
[Summary view] [Print] [Text view]
<?php

/**
 * IDNA URL encoder
 *
 * Converts internationalised hostnames to their ASCII-compatible encoding
 * (ACE) by Punycode-encoding every non-ASCII label and prefixing it with
 * `xn--`.
 *
 * Note: Not fully compliant, as nameprep does nothing yet.
 *
 * @package Requests
 * @subpackage Utilities
 * @see https://tools.ietf.org/html/rfc3490 IDNA specification
 * @see https://tools.ietf.org/html/rfc3492 Punycode/Bootstrap specification
 */
class Requests_IDNAEncoder {
	/**
	 * ACE prefix used for IDNA
	 *
	 * @see https://tools.ietf.org/html/rfc3490#section-5
	 * @var string
	 */
	const ACE_PREFIX = 'xn--';

	/**#@+
	 * Bootstrap constant for Punycode
	 *
	 * @see https://tools.ietf.org/html/rfc3492#section-5
	 * @var int
	 */
	const BOOTSTRAP_BASE         = 36;
	const BOOTSTRAP_TMIN         = 1;
	const BOOTSTRAP_TMAX         = 26;
	const BOOTSTRAP_SKEW         = 38;
	const BOOTSTRAP_DAMP         = 700;
	const BOOTSTRAP_INITIAL_BIAS = 72;
	const BOOTSTRAP_INITIAL_N    = 128;
	/**#@-*/

	/**
	 * Encode a hostname using Punycode
	 *
	 * The hostname is split on `.` and every label is converted on its own
	 * with {@see Requests_IDNAEncoder::to_ascii()}; labels that are already
	 * ASCII (including empty labels) are passed through unchanged.
	 *
	 * @throws Requests_Exception When any label cannot be converted (see to_ascii())
	 *
	 * @param string $string Hostname
	 * @return string Punycode-encoded hostname
	 */
	public static function encode($string) {
		$parts = explode('.', $string);

		// Write back by key rather than by reference, so no reference to the
		// last element is left dangling after the loop.
		foreach ($parts as $index => $part) {
			$parts[$index] = self::to_ascii($part);
		}

		return implode('.', $parts);
	}

	/**
	 * Convert a UTF-8 string to an ASCII string using Punycode
	 *
	 * Follows the ToASCII steps of RFC 3490, section 4.1. The size check
	 * accepts results of at most 63 bytes.
	 *
	 * @throws Requests_Exception Provided string is 64 or more ASCII characters long (`idna.provided_too_long`)
	 * @throws Requests_Exception Prepared string is 64 or more ASCII characters long (`idna.prepared_too_long`)
	 * @throws Requests_Exception Provided string already begins with xn-- in any capitalization (`idna.provided_is_prefixed`)
	 * @throws Requests_Exception Encoded string is 64 or more ASCII characters long (`idna.encoded_too_long`)
	 * @throws Requests_Exception Invalid UTF-8 in the provided string (`idna.invalidcodepoint`)
	 *
	 * @param string $string ASCII or UTF-8 string (max length 63 bytes)
	 * @return string ASCII string
	 */
	public static function to_ascii($string) {
		// Step 1: Check if the string is already ASCII
		if (self::is_ascii($string)) {
			// Skip to step 8 (size check)
			if (strlen($string) < 64) {
				return $string;
			}

			throw new Requests_Exception('Provided string is too long', 'idna.provided_too_long', $string);
		}

		// Step 2: nameprep
		$string = self::nameprep($string);

		// Step 3: UseSTD3ASCIIRules is false, continue
		// Step 4: Check if it's ASCII now
		if (self::is_ascii($string)) {
			// Skip to step 8 (size check)
			if (strlen($string) < 64) {
				return $string;
			}

			throw new Requests_Exception('Prepared string is too long', 'idna.prepared_too_long', $string);
		}

		// Step 5: Check ACE prefix.
		// RFC 3490 section 5 defines the prefix as "xn--" or any
		// capitalization thereof, so the comparison is case-insensitive.
		if (strncasecmp($string, self::ACE_PREFIX, strlen(self::ACE_PREFIX)) === 0) {
			throw new Requests_Exception('Provided string begins with ACE prefix', 'idna.provided_is_prefixed', $string);
		}

		// Step 6: Encode with Punycode
		$string = self::punycode_encode($string);

		// Step 7: Prepend ACE prefix
		$string = self::ACE_PREFIX . $string;

		// Step 8: Check size
		if (strlen($string) < 64) {
			return $string;
		}

		throw new Requests_Exception('Encoded string is too long', 'idna.encoded_too_long', $string);
	}

	/**
	 * Check whether a given string contains only ASCII characters
	 *
	 * @internal (Testing found regex was the fastest implementation)
	 *
	 * @param string $string
	 * @return bool Is the string ASCII-only?
	 */
	protected static function is_ascii($string) {
		// True unless a byte outside 0x00-0x7F is found.
		return (preg_match('/(?:[^\x00-\x7F])/', $string) !== 1);
	}

	/**
	 * Prepare a string for use as an IDNA name
	 *
	 * Currently a no-op: the string is returned unchanged.
	 *
	 * @todo Implement this based on RFC 3491 and the newer 5891
	 * @param string $string
	 * @return string Prepared string
	 */
	protected static function nameprep($string) {
		return $string;
	}

	/**
	 * Convert a UTF-8 string to a UCS-4 codepoint array
	 *
	 * Based on Requests_IRI::replace_invalid_with_pct_encoding()
	 *
	 * @throws Requests_Exception Invalid UTF-8 codepoint (`idna.invalidcodepoint`)
	 * @param string $input
	 * @return array Unicode code points
	 */
	protected static function utf8_to_codepoints($input) {
		$codepoints = array();

		// Get number of bytes
		$strlen = strlen($input);

		// phpcs:ignore Generic.CodeAnalysis.JumbledIncrementer -- This is a deliberate choice.
		for ($position = 0; $position < $strlen; $position++) {
			$value = ord($input[$position]);

			// One byte sequence:
			if ((~$value & 0x80) === 0x80) {
				$character = $value;
				$length    = 1;
				$remaining = 0;
			}
			// Two byte sequence:
			elseif (($value & 0xE0) === 0xC0) {
				$character = ($value & 0x1F) << 6;
				$length    = 2;
				$remaining = 1;
			}
			// Three byte sequence:
			elseif (($value & 0xF0) === 0xE0) {
				$character = ($value & 0x0F) << 12;
				$length    = 3;
				$remaining = 2;
			}
			// Four byte sequence:
			elseif (($value & 0xF8) === 0xF0) {
				$character = ($value & 0x07) << 18;
				$length    = 4;
				$remaining = 3;
			}
			// Invalid byte:
			else {
				throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $value);
			}

			if ($remaining > 0) {
				// The sequence must not run past the end of the input.
				if ($position + $length > $strlen) {
					throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
				}

				for ($position++; $remaining > 0; $position++) {
					$value = ord($input[$position]);

					// Every trailing byte must be a continuation byte (10xxxxxx):
					if (($value & 0xC0) !== 0x80) {
						throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
					}

					--$remaining;
					$character |= ($value & 0x3F) << ($remaining * 6);
				}

				// Step back so the outer loop's increment lands on the byte
				// following this sequence.
				$position--;
			}

			if (// Non-shortest form sequences are invalid
				$length > 1 && $character <= 0x7F
				|| $length > 2 && $character <= 0x7FF
				|| $length > 3 && $character <= 0xFFFF
				// Outside of range of ucschar codepoints
				// Noncharacters
				|| ($character & 0xFFFE) === 0xFFFE
				|| $character >= 0xFDD0 && $character <= 0xFDEF
				|| (
					// Everything else not in ucschar
					$character > 0xD7FF && $character < 0xF900
					|| $character < 0x20
					|| $character > 0x7E && $character < 0xA0
					|| $character > 0xEFFFD
				)
			) {
				throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
			}

			$codepoints[] = $character;
		}

		return $codepoints;
	}

	/**
	 * RFC3492-compliant encoder
	 *
	 * @internal Pseudo-code from Section 6.3 is commented with "#" next to relevant code
	 * @throws Requests_Exception On character outside of the domain (never happens with Punycode) (`idna.character_outside_domain`)
	 * @throws Requests_Exception Invalid UTF-8 in the input (`idna.invalidcodepoint`)
	 *
	 * @param string $input UTF-8 encoded string to encode
	 * @return string Punycode-encoded string
	 */
	public static function punycode_encode($input) {
		$output = '';
		// let n = initial_n
		$n = self::BOOTSTRAP_INITIAL_N;
		// let delta = 0
		$delta = 0;
		// let bias = initial_bias
		$bias = self::BOOTSTRAP_INITIAL_BIAS;
		// let h = b = the number of basic code points in the input
		$h = 0;
		// copy them to the output in order
		$codepoints = self::utf8_to_codepoints($input);
		$extended   = array();

		foreach ($codepoints as $char) {
			if ($char < 128) {
				// Character is valid ASCII
				// TODO: this should also check if it's valid for a URL
				$output .= chr($char);
				$h++;
			}
			// Check if the character is non-ASCII, but below initial n
			// This never occurs for Punycode, so ignore in coverage
			// @codeCoverageIgnoreStart
			elseif ($char < $n) {
				throw new Requests_Exception('Invalid character', 'idna.character_outside_domain', $char);
			}
			// @codeCoverageIgnoreEnd
			else {
				// Keyed by code point so each distinct value is kept once.
				$extended[$char] = true;
			}
		}

		// Distinct non-basic code points, ascending: each pass of the main
		// loop takes the next-smallest one as m.
		$extended = array_keys($extended);
		sort($extended);
		$b = $h;

		// [copy them] followed by a delimiter if b > 0
		if (strlen($output) > 0) {
			$output .= '-';
		}

		// {if the input contains a non-basic code point < n then fail}
		// while h < length(input) do begin
		$codepointcount = count($codepoints);
		while ($h < $codepointcount) {
			// let m = the minimum code point >= n in the input
			$m = array_shift($extended);
			// let delta = delta + (m - n) * (h + 1), fail on overflow
			$delta += ($m - $n) * ($h + 1);
			// let n = m
			$n = $m;
			// for each code point c in the input (in order) do begin
			for ($num = 0; $num < $codepointcount; $num++) {
				$c = $codepoints[$num];
				// if c < n then increment delta, fail on overflow
				if ($c < $n) {
					$delta++;
				}
				// if c == n then begin
				elseif ($c === $n) {
					// let q = delta
					$q = $delta;
					// for k = base to infinity in steps of base do begin
					for ($k = self::BOOTSTRAP_BASE; ; $k += self::BOOTSTRAP_BASE) {
						// let t = tmin if k <= bias {+ tmin}, or
						//     tmax if k >= bias + tmax, or k - bias otherwise
						if ($k <= ($bias + self::BOOTSTRAP_TMIN)) {
							$t = self::BOOTSTRAP_TMIN;
						}
						elseif ($k >= ($bias + self::BOOTSTRAP_TMAX)) {
							$t = self::BOOTSTRAP_TMAX;
						}
						else {
							$t = $k - $bias;
						}
						// if q < t then break
						if ($q < $t) {
							break;
						}
						// output the code point for digit t + ((q - t) mod (base - t))
						$digit   = $t + (($q - $t) % (self::BOOTSTRAP_BASE - $t));
						$output .= self::digit_to_char($digit);
						// let q = (q - t) div (base - t)
						// Cast keeps q an integer; floor() alone yields a float.
						$q = (int) floor(($q - $t) / (self::BOOTSTRAP_BASE - $t));
					} // end
					// output the code point for digit q
					$output .= self::digit_to_char($q);
					// let bias = adapt(delta, h + 1, test h equals b?)
					$bias = self::adapt($delta, $h + 1, $h === $b);
					// let delta = 0
					$delta = 0;
					// increment h
					$h++;
				} // end
			} // end
			// increment delta and n
			$delta++;
			$n++;
		} // end

		return $output;
	}

	/**
	 * Convert a digit to its respective character
	 *
	 * Digits 0-25 map to `a`-`z` and digits 26-35 map to `0`-`9`.
	 *
	 * @see https://tools.ietf.org/html/rfc3492#section-5
	 * @throws Requests_Exception On invalid digit (`idna.invalid_digit`)
	 *
	 * @param int $digit Digit in the range 0-35
	 * @return string Single character corresponding to digit
	 */
	protected static function digit_to_char($digit) {
		// @codeCoverageIgnoreStart
		// As far as I know, this never happens, but still good to be sure.
		if ($digit < 0 || $digit > 35) {
			throw new Requests_Exception(sprintf('Invalid digit %d', $digit), 'idna.invalid_digit', $digit);
		}
		// @codeCoverageIgnoreEnd
		$digits = 'abcdefghijklmnopqrstuvwxyz0123456789';
		return substr($digits, $digit, 1);
	}

	/**
	 * Adapt the bias
	 *
	 * All divisions are integer divisions ("div" in the RFC pseudo-code), so
	 * every floor() result is cast back to int and the returned bias is an
	 * integer as documented.
	 *
	 * @see https://tools.ietf.org/html/rfc3492#section-6.1
	 * @param int $delta
	 * @param int $numpoints
	 * @param bool $firsttime
	 * @return int New bias
	 *
	 * function adapt(delta,numpoints,firsttime):
	 */
	protected static function adapt($delta, $numpoints, $firsttime) {
		// if firsttime then let delta = delta div damp
		if ($firsttime) {
			$delta = (int) floor($delta / self::BOOTSTRAP_DAMP);
		}
		// else let delta = delta div 2
		else {
			$delta = (int) floor($delta / 2);
		}
		// let delta = delta + (delta div numpoints)
		$delta += (int) floor($delta / $numpoints);
		// let k = 0
		$k = 0;
		// while delta > ((base - tmin) * tmax) div 2 do begin
		$max = (int) floor(((self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN) * self::BOOTSTRAP_TMAX) / 2);
		while ($delta > $max) {
			// let delta = delta div (base - tmin)
			$delta = (int) floor($delta / (self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN));
			// let k = k + base
			$k += self::BOOTSTRAP_BASE;
		} // end
		// return k + (((base - tmin + 1) * delta) div (delta + skew))
		return $k + (int) floor(((self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN + 1) * $delta) / ($delta + self::BOOTSTRAP_SKEW));
	}
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Thu Dec 15 01:00:02 2022 | Cross-referenced by PHPXref 0.7.1 |