[ Index ] |
PHP Cross Reference of WordPress |
[Summary view] [Print] [Text view]
<?php
/**
 * REST API: WP_REST_URL_Details_Controller class
 *
 * @package WordPress
 * @subpackage REST_API
 * @since 5.9.0
 */

/**
 * Controller which provides REST endpoint for retrieving information
 * from a remote site's HTML response.
 *
 * The endpoint fetches the remote document, caches its body in a site
 * transient, and extracts the title, icon, description and Open Graph image
 * from the document head using regular expressions.
 *
 * @since 5.9.0
 *
 * @see WP_REST_Controller
 */
class WP_REST_URL_Details_Controller extends WP_REST_Controller {

	/**
	 * Constructs the controller.
	 *
	 * @since 5.9.0
	 */
	public function __construct() {
		$this->namespace = 'wp-block-editor/v1';
		$this->rest_base = 'url-details';
	}

	/**
	 * Registers the necessary REST API routes.
	 *
	 * @since 5.9.0
	 */
	public function register_routes() {
		register_rest_route(
			$this->namespace,
			'/' . $this->rest_base,
			array(
				array(
					'methods'             => WP_REST_Server::READABLE,
					'callback'            => array( $this, 'parse_url_details' ),
					'args'                => array(
						'url' => array(
							'required'          => true,
							'description'       => __( 'The URL to process.' ),
							'validate_callback' => 'wp_http_validate_url',
							'sanitize_callback' => 'esc_url_raw',
							'type'              => 'string',
							'format'            => 'uri',
						),
					),
					'permission_callback' => array( $this, 'permissions_check' ),
					'schema'              => array( $this, 'get_public_item_schema' ),
				),
			)
		);
	}

	/**
	 * Retrieves the item's schema, conforming to JSON Schema.
	 *
	 * The schema is built once and memoized on `$this->schema`.
	 *
	 * @since 5.9.0
	 *
	 * @return array Item schema data.
	 */
	public function get_item_schema() {
		if ( $this->schema ) {
			return $this->add_additional_fields_schema( $this->schema );
		}

		$this->schema = array(
			'$schema'    => 'http://json-schema.org/draft-04/schema#',
			'title'      => 'url-details',
			'type'       => 'object',
			'properties' => array(
				'title'       => array(
					'description' => sprintf(
						/* translators: %s: HTML title tag. */
						__( 'The contents of the %s element from the URL.' ),
						'<title>'
					),
					'type'        => 'string',
					'context'     => array( 'view', 'edit', 'embed' ),
					'readonly'    => true,
				),
				'icon'        => array(
					'description' => sprintf(
						/* translators: %s: HTML link tag. */
						__( 'The favicon image link of the %s element from the URL.' ),
						'<link rel="icon">'
					),
					'type'        => 'string',
					'format'      => 'uri',
					'context'     => array( 'view', 'edit', 'embed' ),
					'readonly'    => true,
				),
				'description' => array(
					'description' => sprintf(
						/* translators: %s: HTML meta tag. */
						__( 'The content of the %s element from the URL.' ),
						'<meta name="description">'
					),
					'type'        => 'string',
					'context'     => array( 'view', 'edit', 'embed' ),
					'readonly'    => true,
				),
				'image'       => array(
					'description' => sprintf(
						/* translators: 1: HTML meta tag, 2: HTML meta tag. */
						__( 'The Open Graph image link of the %1$s or %2$s element from the URL.' ),
						'<meta property="og:image">',
						'<meta property="og:image:url">'
					),
					'type'        => 'string',
					'format'      => 'uri',
					'context'     => array( 'view', 'edit', 'embed' ),
					'readonly'    => true,
				),
			),
		);

		return $this->add_additional_fields_schema( $this->schema );
	}

	/**
	 * Retrieves the details (title, icon, description and image) parsed from
	 * the HTML response of the requested URL.
	 *
	 * The remote body is read from the per-URL cache when available; otherwise it
	 * is fetched and, when valid, stored in the cache before being parsed.
	 *
	 * @since 5.9.0
	 *
	 * @param WP_REST_Request $request Full details about the request.
	 * @return WP_REST_Response|WP_Error The parsed details as a response object. WP_Error if there are errors.
	 */
	public function parse_url_details( $request ) {
		$url = untrailingslashit( $request['url'] );

		if ( empty( $url ) ) {
			return new WP_Error( 'rest_invalid_url', __( 'Invalid URL' ), array( 'status' => 404 ) );
		}

		// Transient per URL.
		$cache_key = $this->build_cache_key_for_url( $url );

		// Attempt to retrieve cached response.
		$cached_response = $this->get_cache( $cache_key );

		if ( ! empty( $cached_response ) ) {
			$remote_url_response = $cached_response;
		} else {
			$remote_url_response = $this->get_remote_url( $url );

			// Exit if we don't have a valid body or it's empty.
			if ( is_wp_error( $remote_url_response ) || empty( $remote_url_response ) ) {
				return $remote_url_response;
			}

			// Cache the valid response.
			$this->set_cache( $cache_key, $remote_url_response );
		}

		$html_head     = $this->get_document_head( $remote_url_response );
		$meta_elements = $this->get_meta_with_content_elements( $html_head );

		$data = $this->add_additional_fields_to_object(
			array(
				'title'       => $this->get_title( $html_head ),
				'icon'        => $this->get_icon( $html_head, $url ),
				'description' => $this->get_description( $meta_elements ),
				'image'       => $this->get_image( $meta_elements, $url ),
			),
			$request
		);

		// Wrap the data in a response object.
		$response = rest_ensure_response( $data );

		/**
		 * Filters the URL data for the response.
		 *
		 * @since 5.9.0
		 *
		 * @param WP_REST_Response $response            The response object.
		 * @param string           $url                 The requested URL.
		 * @param WP_REST_Request  $request             Request object.
		 * @param string           $remote_url_response HTTP response body from the remote URL.
		 */
		return apply_filters( 'rest_prepare_url_details', $response, $url, $request, $remote_url_response );
	}

	/**
	 * Checks whether a given request has permission to read remote URLs.
	 *
	 * Access is granted when the current user can `edit_posts`, or can edit posts
	 * of at least one post type that is shown in REST.
	 *
	 * @since 5.9.0
	 *
	 * @return WP_Error|bool True if the request has permission, else WP_Error.
	 */
	public function permissions_check() {
		if ( current_user_can( 'edit_posts' ) ) {
			return true;
		}

		foreach ( get_post_types( array( 'show_in_rest' => true ), 'objects' ) as $post_type ) {
			if ( current_user_can( $post_type->cap->edit_posts ) ) {
				return true;
			}
		}

		return new WP_Error(
			'rest_cannot_view_url_details',
			__( 'Sorry, you are not allowed to process remote URLs.' ),
			array( 'status' => rest_authorization_required_code() )
		);
	}

	/**
	 * Retrieves the HTTP response body from a remote URL.
	 *
	 * @since 5.9.0
	 *
	 * @param string $url The website URL whose HTML to access.
	 * @return string|WP_Error The HTTP response body from the remote URL on success.
	 *                         WP_Error if no response or no content.
	 */
	private function get_remote_url( $url ) {

		/*
		 * Provide a modified UA string to workaround web properties which block WordPress "Pingbacks".
		 * Why? The UA string used for pingback requests contains `WordPress/` which is very similar
		 * to that used as the default UA string by the WP HTTP API. Therefore requests from this
		 * REST endpoint are being unintentionally blocked as they are misidentified as pingback requests.
		 * By slightly modifying the UA string, but still retaining the "WordPress" identification (via "WP")
		 * we are able to work around this issue.
		 * Example UA string: `WP-URLDetails/5.9-alpha-51389 (+http://localhost:8888)`.
		 */
		$modified_user_agent = 'WP-URLDetails/' . get_bloginfo( 'version' ) . ' (+' . get_bloginfo( 'url' ) . ')';

		$args = array(
			'limit_response_size' => 150 * KB_IN_BYTES,
			'user-agent'          => $modified_user_agent,
		);

		/**
		 * Filters the HTTP request args for URL data retrieval.
		 *
		 * Can be used to adjust response size limit and other WP_Http::request() args.
		 *
		 * @since 5.9.0
		 *
		 * @param array  $args Arguments used for the HTTP request.
		 * @param string $url  The attempted URL.
		 */
		$args = apply_filters( 'rest_url_details_http_request_args', $args, $url );

		$response = wp_safe_remote_get( $url, $args );

		if ( WP_Http::OK !== wp_remote_retrieve_response_code( $response ) ) {
			// Not saving the error response to cache since the error might be temporary.
			return new WP_Error(
				'no_response',
				__( 'URL not found. Response returned a non-200 status code for this URL.' ),
				array( 'status' => WP_Http::NOT_FOUND )
			);
		}

		$remote_body = wp_remote_retrieve_body( $response );

		if ( empty( $remote_body ) ) {
			return new WP_Error(
				'no_content',
				__( 'Unable to retrieve body from response at this URL.' ),
				array( 'status' => WP_Http::NOT_FOUND )
			);
		}

		return $remote_body;
	}

	/**
	 * Parses the title tag contents from the provided HTML.
	 *
	 * @since 5.9.0
	 *
	 * @param string $html The HTML from the remote website at URL.
	 * @return string The title tag contents on success. Empty string if not found.
	 */
	private function get_title( $html ) {
		$pattern = '#<title[^>]*>(.*?)<\s*/\s*title>#is';
		preg_match( $pattern, $html, $match_title );

		if ( empty( $match_title[1] ) || ! is_string( $match_title[1] ) ) {
			return '';
		}

		$title = trim( $match_title[1] );

		return $this->prepare_metadata_for_output( $title );
	}

	/**
	 * Parses the site icon from the provided HTML.
	 *
	 * @since 5.9.0
	 *
	 * @param string $html The HTML from the remote website at URL.
	 * @param string $url  The target website URL.
	 * @return string The icon URI on success. Empty string if not found.
	 */
	private function get_icon( $html, $url ) {
		// Grab the icon's link element.
		$pattern = '#<link\s[^>]*rel=(?:[\"\']??)\s*(?:icon|shortcut icon|icon shortcut)\s*(?:[\"\']??)[^>]*\/?>#isU';
		preg_match( $pattern, $html, $element );
		if ( empty( $element[0] ) || ! is_string( $element[0] ) ) {
			return '';
		}
		$element = trim( $element[0] );

		// Get the icon's href value.
		$pattern = '#href=([\"\']??)([^\" >]*?)\\1[^>]*#isU';
		preg_match( $pattern, $element, $icon );
		if ( empty( $icon[2] ) || ! is_string( $icon[2] ) ) {
			return '';
		}
		$icon = trim( $icon[2] );

		// If the icon is a data URL, return it.
		$parsed_icon = parse_url( $icon );
		if ( isset( $parsed_icon['scheme'] ) && 'data' === $parsed_icon['scheme'] ) {
			return $icon;
		}

		// Attempt to convert relative URLs to absolute.
		if ( ! is_string( $url ) || '' === $url ) {
			return $icon;
		}
		$parsed_url = parse_url( $url );
		if ( isset( $parsed_url['scheme'] ) && isset( $parsed_url['host'] ) ) {
			$root_url = $parsed_url['scheme'] . '://' . $parsed_url['host'] . '/';
			$icon     = WP_Http::make_absolute_url( $icon, $root_url );
		}

		return $icon;
	}

	/**
	 * Parses the meta description from the provided HTML.
	 *
	 * @since 5.9.0
	 *
	 * @param array $meta_elements {
	 *     A multi-dimensional indexed array on success, else empty array.
	 *
	 *     @type string[] $0 Meta elements with a content attribute.
	 *     @type string[] $1 Content attribute's opening quotation mark.
	 *     @type string[] $2 Content attribute's value for each meta element.
	 * }
	 * @return string The meta description contents on success. Empty string if not found.
	 */
	private function get_description( $meta_elements ) {
		// Bail out if there are no meta elements.
		if ( empty( $meta_elements[0] ) ) {
			return '';
		}

		$description = $this->get_metadata_from_meta_element(
			$meta_elements,
			'name',
			'(?:description|og:description)'
		);

		// Bail out if description not found.
		if ( '' === $description ) {
			return '';
		}

		return $this->prepare_metadata_for_output( $description );
	}

	/**
	 * Parses the Open Graph (OG) Image from the provided HTML.
	 *
	 * See: https://ogp.me/.
	 *
	 * @since 5.9.0
	 *
	 * @param array  $meta_elements {
	 *     A multi-dimensional indexed array on success, else empty array.
	 *
	 *     @type string[] $0 Meta elements with a content attribute.
	 *     @type string[] $1 Content attribute's opening quotation mark.
	 *     @type string[] $2 Content attribute's value for each meta element.
	 * }
	 * @param string $url The target website URL.
	 * @return string The OG image on success. Empty string if not found.
	 */
	private function get_image( $meta_elements, $url ) {
		$image = $this->get_metadata_from_meta_element(
			$meta_elements,
			'property',
			'(?:og:image|og:image:url)'
		);

		// Bail out if image not found.
		if ( '' === $image ) {
			return '';
		}

		// Attempt to convert relative URLs to absolute.
		$parsed_url = parse_url( $url );
		if ( isset( $parsed_url['scheme'] ) && isset( $parsed_url['host'] ) ) {
			$root_url = $parsed_url['scheme'] . '://' . $parsed_url['host'] . '/';
			$image    = WP_Http::make_absolute_url( $image, $root_url );
		}

		return $image;
	}

	/**
	 * Prepares the metadata by:
	 *    - stripping all HTML tags and tag entities.
	 *    - converting non-tag entities into characters.
	 *
	 * @since 5.9.0
	 *
	 * @param string $metadata The metadata content to prepare.
	 * @return string The prepared metadata.
	 */
	private function prepare_metadata_for_output( $metadata ) {
		$metadata = html_entity_decode( $metadata, ENT_QUOTES, get_bloginfo( 'charset' ) );
		$metadata = wp_strip_all_tags( $metadata );
		return $metadata;
	}

	/**
	 * Utility function to build cache key for a given URL.
	 *
	 * @since 5.9.0
	 *
	 * @param string $url The URL for which to build a cache key.
	 * @return string The cache key.
	 */
	private function build_cache_key_for_url( $url ) {
		return 'g_url_details_response_' . md5( $url );
	}

	/**
	 * Utility function to retrieve a value from the cache at a given key.
	 *
	 * @since 5.9.0
	 *
	 * @param string $key The cache key.
	 * @return mixed The value from the cache.
	 */
	private function get_cache( $key ) {
		return get_site_transient( $key );
	}

	/**
	 * Utility function to cache a given data set at a given cache key.
	 *
	 * @since 5.9.0
	 *
	 * @param string $key  The cache key under which to store the value.
	 * @param string $data The data to be stored at the given cache key.
	 * @return bool True when transient set. False if not set.
	 */
	private function set_cache( $key, $data = '' ) {
		$ttl = HOUR_IN_SECONDS;

		/**
		 * Filters the cache expiration.
		 *
		 * Can be used to adjust the time until expiration in seconds for the cache
		 * of the data retrieved for the given URL.
		 *
		 * @since 5.9.0
		 *
		 * @param int $ttl The time until cache expiration in seconds.
		 */
		$cache_expiration = apply_filters( 'rest_url_details_cache_expiration', $ttl );

		return set_site_transient( $key, $data, $cache_expiration );
	}

	/**
	 * Retrieves the head element section.
	 *
	 * The section runs from the opening `<head` tag up to the first `</head>`
	 * that follows it or, when there is no closing tag, up to the first `<body`
	 * that follows it. A closing `</head>` is always appended to the result.
	 *
	 * @since 5.9.0
	 *
	 * @param string $html The string of HTML to parse.
	 * @return string The `<head>..</head>` section on success. Given `$html` if not found.
	 */
	private function get_document_head( $html ) {
		// Find the opening `<head>` tag.
		$head_start = strpos( $html, '<head' );
		if ( false === $head_start ) {
			// Didn't find it. Return the original HTML.
			return $html;
		}

		/*
		 * Find the closing `</head>` tag. The search starts at the opening tag so
		 * that the end position can never precede the start position.
		 */
		$head_end = strpos( $html, '</head>', $head_start );
		if ( false === $head_end ) {
			// Didn't find it. Find the opening `<body>` tag that follows the head.
			$head_end = strpos( $html, '<body', $head_start );

			// Didn't find it. Return the original HTML.
			if ( false === $head_end ) {
				return $html;
			}
		}

		/*
		 * Extract the HTML from the opening tag up to the closing tag, then add the
		 * closing tag. The third argument of substr() is a length rather than an end
		 * offset, so the start position is subtracted from the end position;
		 * otherwise markup following the head would leak into the result.
		 */
		$head_html  = substr( $html, $head_start, $head_end - $head_start );
		$head_html .= '</head>';

		return $head_html;
	}

	/**
	 * Gets all the meta tag elements that have a 'content' attribute.
	 *
	 * @since 5.9.0
	 *
	 * @param string $html The string of HTML to be parsed.
	 * @return array {
	 *     A multi-dimensional indexed array on success, else empty array.
	 *
	 *     @type string[] $0 Meta elements with a content attribute.
	 *     @type string[] $1 Content attribute's opening quotation mark.
	 *     @type string[] $2 Content attribute's value for each meta element.
	 * }
	 */
	private function get_meta_with_content_elements( $html ) {
		/*
		 * Parse all meta elements with a content attribute.
		 *
		 * Why first search for the content attribute rather than directly searching for name=description element?
		 * tl;dr The content attribute's value will be truncated when it contains a > symbol.
		 *
		 * The content attribute's value (i.e. the description to get) can have HTML in it and be well-formed as
		 * it's a string to the browser. Imagine what happens when attempting to match for the name=description
		 * first. Hmm, if a > or /> symbol is in the content attribute's value, then it terminates the match
		 * as the element's closing symbol. But wait, it's in the content attribute and is not the end of the
		 * element. This is a limitation of using regex. It can't determine "wait a minute this is inside of quotation".
		 * If this happens, what gets matched is not the entire element or all of the content.
		 *
		 * Why not search for the name=description and then content="(.*)"?
		 * The attribute order could be opposite. Plus, additional attributes may exist including being between
		 * the name and content attributes.
		 *
		 * Why not lookahead?
		 * Lookahead is not constrained to stay within the element. The first <meta it finds may not include
		 * the name or content, but rather could be from a different element downstream.
		 */
		$pattern = '#<meta\s' .

				/*
				 * Allows for additional attributes before the content attribute.
				 * Searches for anything other than > symbol.
				 */
				'[^>]*' .

				/*
				 * Find the content attribute. When found, capture its value (.*).
				 *
				 * Allows for (a) single or double quotes and (b) whitespace in the value.
				 *
				 * Why capture the opening quotation mark, i.e. (["\']), and then backreference,
				 * i.e \1, for the closing quotation mark?
				 * To ensure the closing quotation mark matches the opening one. Why? Attribute values
				 * can contain quotation marks, such as an apostrophe in the content.
				 */
				'content=(["\']??)(.*)\1' .

				/*
				 * Allows for additional attributes after the content attribute.
				 * Searches for anything other than > symbol.
				 */
				'[^>]*' .

				/*
				 * \/?> searches for the closing > symbol, which can be in either /> or > format.
				 * # ends the pattern.
				 */
				'\/?>#' .

				/*
				 * These are the options:
				 * - i : case insensitive
				 * - s : allows newline characters for the . match (needed for multiline elements)
				 * - U means non-greedy matching
				 */
				'isU';

		preg_match_all( $pattern, $html, $elements );

		return $elements;
	}

	/**
	 * Gets the metadata from a target meta element.
	 *
	 * Only the first element whose attribute matches is considered.
	 *
	 * @since 5.9.0
	 *
	 * @param array  $meta_elements {
	 *     A multi-dimensional indexed array on success, else empty array.
	 *
	 *     @type string[] $0 Meta elements with a content attribute.
	 *     @type string[] $1 Content attribute's opening quotation mark.
	 *     @type string[] $2 Content attribute's value for each meta element.
	 * }
	 * @param string $attr       Attribute that identifies the element with the target metadata.
	 * @param string $attr_value The attribute's value that identifies the element with the target metadata.
	 *                           It is inserted into the pattern unescaped, so it may be a regex fragment.
	 * @return string The metadata on success. Empty string if not found.
	 */
	private function get_metadata_from_meta_element( $meta_elements, $attr, $attr_value ) {
		// Bail out if there are no meta elements.
		if ( empty( $meta_elements[0] ) ) {
			return '';
		}

		$metadata = '';
		$pattern  = '#' .
				/*
				 * Target this attribute and value to find the metadata element.
				 *
				 * Allows for (a) no, single, double quotes and (b) whitespace in the value.
				 *
				 * Why capture the opening quotation mark, i.e. (["\']), and then backreference,
				 * i.e \1, for the closing quotation mark?
				 * To ensure the closing quotation mark matches the opening one. Why? Attribute values
				 * can contain quotation marks, such as an apostrophe in the content.
				 */
				$attr . '=([\"\']??)\s*' . $attr_value . '\s*\1' .

				/*
				 * These are the options:
				 * - i : case insensitive
				 * - s : allows newline characters for the . match (needed for multiline elements)
				 * - U means non-greedy matching
				 */
				'#isU';

		// Find the metadata element.
		foreach ( $meta_elements[0] as $index => $element ) {
			preg_match( $pattern, $element, $match );

			// This is not the metadata element. Skip it.
			if ( empty( $match ) ) {
				continue;
			}

			/*
			 * Found the metadata element.
			 * Get the metadata from its matching content array.
			 */
			if ( isset( $meta_elements[2][ $index ] ) && is_string( $meta_elements[2][ $index ] ) ) {
				$metadata = trim( $meta_elements[2][ $index ] );
			}

			break;
		}

		return $metadata;
	}
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Mon Sep 9 01:00:02 2024 | Cross-referenced by PHPXref 0.7.1 |