[ Index ] |
PHP Cross Reference of WordPress |
[Summary view] [Print] [Text view]
<?php

/**
 * IDNA URL encoder
 *
 * Converts internationalised hostnames into their ASCII-compatible (ACE)
 * form, one dot-separated label at a time.
 *
 * Note: Not fully compliant, as nameprep does nothing yet.
 *
 * @package Requests
 * @subpackage Utilities
 * @see https://tools.ietf.org/html/rfc3490 IDNA specification
 * @see https://tools.ietf.org/html/rfc3492 Punycode/Bootstrap specification
 */
class Requests_IDNAEncoder {
	/**
	 * ACE prefix used for IDNA
	 *
	 * @see https://tools.ietf.org/html/rfc3490#section-5
	 * @var string
	 */
	const ACE_PREFIX = 'xn--';

	/**
	 * Longest label, in bytes, that to_ascii() will return
	 *
	 * Anything longer is rejected with a Requests_Exception.
	 *
	 * @var int
	 */
	const MAX_LABEL_LENGTH = 63;

	/**#@+
	 * Bootstrap constant for Punycode
	 *
	 * @see https://tools.ietf.org/html/rfc3492#section-5
	 * @var int
	 */
	const BOOTSTRAP_BASE         = 36;
	const BOOTSTRAP_TMIN         = 1;
	const BOOTSTRAP_TMAX         = 26;
	const BOOTSTRAP_SKEW         = 38;
	const BOOTSTRAP_DAMP         = 700;
	const BOOTSTRAP_INITIAL_BIAS = 72;
	const BOOTSTRAP_INITIAL_N    = 128;
	/**#@-*/

	/**
	 * Encode a hostname using Punycode
	 *
	 * The hostname is split on "." and every label is passed through
	 * {@see Requests_IDNAEncoder::to_ascii()} independently, then re-joined.
	 *
	 * @throws Requests_Exception Propagated from to_ascii() for any label it rejects
	 *
	 * @param string $string Hostname
	 * @return string Punycode-encoded hostname
	 */
	public static function encode($string) {
		$parts = explode('.', $string);

		// Write back by index rather than by reference so that no dangling
		// reference to the last element is left behind.
		foreach ($parts as $index => $part) {
			$parts[$index] = self::to_ascii($part);
		}

		return implode('.', $parts);
	}

	/**
	 * Convert a UTF-8 string to an ASCII string using Punycode
	 *
	 * Pure-ASCII input is returned untouched (after the length check); anything
	 * else is Punycode-encoded and given the ACE prefix.
	 *
	 * @throws Requests_Exception Provided string longer than 63 ASCII characters (`idna.provided_too_long`)
	 * @throws Requests_Exception Prepared string longer than 63 ASCII characters (`idna.prepared_too_long`)
	 * @throws Requests_Exception Provided string already begins with xn-- (`idna.provided_is_prefixed`)
	 * @throws Requests_Exception Encoded string longer than 63 ASCII characters (`idna.encoded_too_long`)
	 * @throws Requests_Exception Input is not valid UTF-8 (`idna.invalidcodepoint`)
	 *
	 * @param string $string ASCII or UTF-8 string (a single label, at most 63 bytes once encoded)
	 * @return string ASCII string
	 */
	public static function to_ascii($string) {
		// Step 1: Check if the string is already ASCII
		if (self::is_ascii($string)) {
			// Skip to the size check
			if (strlen($string) <= self::MAX_LABEL_LENGTH) {
				return $string;
			}

			throw new Requests_Exception('Provided string is too long', 'idna.provided_too_long', $string);
		}

		// Step 2: nameprep
		$string = self::nameprep($string);

		// Step 3: UseSTD3ASCIIRules is false, continue
		// Step 4: Check if it's ASCII now
		if (self::is_ascii($string)) {
			// Skip to the size check
			if (strlen($string) <= self::MAX_LABEL_LENGTH) {
				return $string;
			}

			throw new Requests_Exception('Prepared string is too long', 'idna.prepared_too_long', $string);
		}

		// Step 5: Check ACE prefix
		if (strpos($string, self::ACE_PREFIX) === 0) {
			throw new Requests_Exception('Provided string begins with ACE prefix', 'idna.provided_is_prefixed', $string);
		}

		// Step 6: Encode with Punycode
		$string = self::punycode_encode($string);

		// Step 7: Prepend ACE prefix
		$string = self::ACE_PREFIX . $string;

		// Step 8: Check size
		if (strlen($string) <= self::MAX_LABEL_LENGTH) {
			return $string;
		}

		throw new Requests_Exception('Encoded string is too long', 'idna.encoded_too_long', $string);
	}

	/**
	 * Check whether a given string contains only ASCII characters
	 *
	 * @internal (Testing found regex was the fastest implementation)
	 *
	 * @param string $string
	 * @return bool Is the string ASCII-only?
	 */
	protected static function is_ascii($string) {
		// Matches the first byte outside 0x00-0x7F, if there is one.
		return (preg_match('/(?:[^\x00-\x7F])/', $string) !== 1);
	}

	/**
	 * Prepare a string for use as an IDNA name
	 *
	 * Currently a no-op: the input is returned unchanged.
	 *
	 * @todo Implement this based on RFC 3491 and the newer 5891
	 * @param string $string
	 * @return string Prepared string
	 */
	protected static function nameprep($string) {
		return $string;
	}

	/**
	 * Convert a UTF-8 string to a UCS-4 codepoint array
	 *
	 * Based on Requests_IRI::replace_invalid_with_pct_encoding()
	 *
	 * Any malformed, over-long or disallowed sequence aborts the conversion
	 * with an exception; nothing is silently replaced or skipped.
	 *
	 * @throws Requests_Exception Invalid UTF-8 codepoint (`idna.invalidcodepoint`)
	 * @param string $input
	 * @return array Unicode code points
	 */
	protected static function utf8_to_codepoints($input) {
		$codepoints = array();

		// Get number of bytes
		$strlen = strlen($input);

		for ($position = 0; $position < $strlen; $position++) {
			$value = ord($input[$position]);

			// One byte sequence:
			if ((~$value & 0x80) === 0x80) {
				$character = $value;
				$length    = 1;
				$remaining = 0;
			}
			// Two byte sequence:
			elseif (($value & 0xE0) === 0xC0) {
				$character = ($value & 0x1F) << 6;
				$length    = 2;
				$remaining = 1;
			}
			// Three byte sequence:
			elseif (($value & 0xF0) === 0xE0) {
				$character = ($value & 0x0F) << 12;
				$length    = 3;
				$remaining = 2;
			}
			// Four byte sequence:
			elseif (($value & 0xF8) === 0xF0) {
				$character = ($value & 0x07) << 18;
				$length    = 4;
				$remaining = 3;
			}
			// Invalid byte:
			else {
				throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $value);
			}

			if ($remaining > 0) {
				// The whole multi-byte sequence has to fit inside the input.
				if ($position + $length > $strlen) {
					throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
				}

				for ($position++; $remaining > 0; $position++) {
					$value = ord($input[$position]);

					// Every trailing byte must be a continuation byte (10xxxxxx).
					if (($value & 0xC0) !== 0x80) {
						throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
					}

					$character |= ($value & 0x3F) << (--$remaining * 6);
				}

				// Step back one byte; the outer loop's increment moves on to
				// the next lead byte.
				$position--;
			}

			if (
				// Non-shortest form sequences are invalid
				($length > 1 && $character <= 0x7F)
				|| ($length > 2 && $character <= 0x7FF)
				|| ($length > 3 && $character <= 0xFFFF)
				// Outside of range of ucschar codepoints
				// Noncharacters
				|| ($character & 0xFFFE) === 0xFFFE
				|| ($character >= 0xFDD0 && $character <= 0xFDEF)
				|| (
					// Everything else not in ucschar
					($character > 0xD7FF && $character < 0xF900)
					|| $character < 0x20
					|| ($character > 0x7E && $character < 0xA0)
					|| $character > 0xEFFFD
				)
			) {
				throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
			}

			$codepoints[] = $character;
		}

		return $codepoints;
	}

	/**
	 * RFC3492-compliant encoder
	 *
	 * The result does NOT include the ACE prefix; to_ascii() adds that.
	 *
	 * @internal Pseudo-code from Section 6.3 is commented with "#" next to relevant code
	 * @throws Requests_Exception On character outside of the domain (never happens with Punycode) (`idna.character_outside_domain`)
	 * @throws Requests_Exception Input is not valid UTF-8 (`idna.invalidcodepoint`)
	 *
	 * @param string $input UTF-8 encoded string to encode
	 * @return string Punycode-encoded string
	 */
	public static function punycode_encode($input) {
		$output = '';
		# let n = initial_n
		$n = self::BOOTSTRAP_INITIAL_N;
		# let delta = 0
		$delta = 0;
		# let bias = initial_bias
		$bias = self::BOOTSTRAP_INITIAL_BIAS;
		# let h = b = the number of basic code points in the input
		$h = $b = 0; // see loop
		# copy them to the output in order
		$codepoints = self::utf8_to_codepoints($input);
		// The code point list never changes, so count it once.
		$total    = count($codepoints);
		$extended = array();

		foreach ($codepoints as $char) {
			if ($char < 128) {
				// Character is valid ASCII
				// TODO: this should also check if it's valid for a URL
				$output .= chr($char);
				$h++;
			}
			// Check if the character is non-ASCII, but below initial n
			// This never occurs for Punycode, so ignore in coverage
			// @codeCoverageIgnoreStart
			elseif ($char < $n) {
				throw new Requests_Exception('Invalid character', 'idna.character_outside_domain', $char);
			}
			// @codeCoverageIgnoreEnd
			else {
				// Keyed by code point so that duplicates collapse.
				$extended[$char] = true;
			}
		}

		// Distinct non-basic code points, smallest first.
		$extended = array_keys($extended);
		sort($extended);
		$b = $h;

		# [copy them] followed by a delimiter if b > 0
		if (strlen($output) > 0) {
			$output .= '-';
		}

		# {if the input contains a non-basic code point < n then fail}
		# while h < length(input) do begin
		while ($h < $total) {
			# let m = the minimum code point >= n in the input
			$m = array_shift($extended);
			# let delta = delta + (m - n) * (h + 1), fail on overflow
			$delta += ($m - $n) * ($h + 1);
			# let n = m
			$n = $m;
			# for each code point c in the input (in order) do begin
			for ($num = 0; $num < $total; $num++) {
				$c = $codepoints[$num];
				# if c < n then increment delta, fail on overflow
				if ($c < $n) {
					$delta++;
				}
				# if c == n then begin
				elseif ($c === $n) {
					# let q = delta
					$q = $delta;
					# for k = base to infinity in steps of base do begin
					for ($k = self::BOOTSTRAP_BASE; ; $k += self::BOOTSTRAP_BASE) {
						# let t = tmin if k <= bias {+ tmin}, or
						#         tmax if k >= bias + tmax, or k - bias otherwise
						if ($k <= ($bias + self::BOOTSTRAP_TMIN)) {
							$t = self::BOOTSTRAP_TMIN;
						}
						elseif ($k >= ($bias + self::BOOTSTRAP_TMAX)) {
							$t = self::BOOTSTRAP_TMAX;
						}
						else {
							$t = $k - $bias;
						}
						# if q < t then break
						if ($q < $t) {
							break;
						}
						# output the code point for digit t + ((q - t) mod (base - t))
						$digit   = $t + (($q - $t) % (self::BOOTSTRAP_BASE - $t));
						$output .= self::digit_to_char($digit);
						# let q = (q - t) div (base - t)
						// floor() returns a float; cast so q stays an integer.
						$q = (int) floor(($q - $t) / (self::BOOTSTRAP_BASE - $t));
					# end
					}
					# output the code point for digit q
					$output .= self::digit_to_char($q);
					# let bias = adapt(delta, h + 1, test h equals b?)
					$bias = self::adapt($delta, $h + 1, $h === $b);
					# let delta = 0
					$delta = 0;
					# increment h
					$h++;
				# end
				}
			# end
			}
			# increment delta and n
			$delta++;
			$n++;
		# end
		}

		return $output;
	}

	/**
	 * Convert a digit to its respective character
	 *
	 * Digits 0-25 map to "a"-"z" and digits 26-35 map to "0"-"9".
	 *
	 * @see https://tools.ietf.org/html/rfc3492#section-5
	 * @throws Requests_Exception On invalid digit (`idna.invalid_digit`)
	 *
	 * @param int $digit Digit in the range 0-35
	 * @return string Single character corresponding to digit
	 */
	protected static function digit_to_char($digit) {
		// @codeCoverageIgnoreStart
		// As far as I know, this never happens, but still good to be sure.
		if ($digit < 0 || $digit > 35) {
			throw new Requests_Exception(sprintf('Invalid digit %d', $digit), 'idna.invalid_digit', $digit);
		}
		// @codeCoverageIgnoreEnd
		$digits = 'abcdefghijklmnopqrstuvwxyz0123456789';
		return substr($digits, $digit, 1);
	}

	/**
	 * Adapt the bias
	 *
	 * Every "div" of the RFC pseudo-code is done with floor() and cast back
	 * to int, so the returned bias is an integer as documented.
	 *
	 * @see https://tools.ietf.org/html/rfc3492#section-6.1
	 * @param int $delta
	 * @param int $numpoints
	 * @param bool $firsttime
	 * @return int New bias
	 */
	protected static function adapt($delta, $numpoints, $firsttime) {
		# function adapt(delta,numpoints,firsttime):
		# if firsttime then let delta = delta div damp
		if ($firsttime) {
			$delta = (int) floor($delta / self::BOOTSTRAP_DAMP);
		}
		# else let delta = delta div 2
		else {
			$delta = (int) floor($delta / 2);
		}
		# let delta = delta + (delta div numpoints)
		$delta += (int) floor($delta / $numpoints);
		# let k = 0
		$k = 0;
		# while delta > ((base - tmin) * tmax) div 2 do begin
		$max = (int) floor(((self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN) * self::BOOTSTRAP_TMAX) / 2);
		while ($delta > $max) {
			# let delta = delta div (base - tmin)
			$delta = (int) floor($delta / (self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN));
			# let k = k + base
			$k += self::BOOTSTRAP_BASE;
		# end
		}
		# return k + (((base - tmin + 1) * delta) div (delta + skew))
		return $k + (int) floor(((self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN + 1) * $delta) / ($delta + self::BOOTSTRAP_SKEW));
	}
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Thu Mar 4 01:00:04 2021 | Cross-referenced by PHPXref 0.7.1 |