chromium/ios/chrome/browser/reading_list/model/reading_list_distiller_page.mm

// Copyright 2016 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#import "ios/chrome/browser/reading_list/model/reading_list_distiller_page.h"

#import "base/apple/foundation_util.h"
#import "base/functional/bind.h"
#import "base/strings/string_util.h"
#import "base/strings/sys_string_conversions.h"
#import "base/strings/utf_string_conversions.h"
#import "base/task/single_thread_task_runner.h"
#import "base/time/time.h"
#import "base/values.h"
#import "components/favicon/ios/web_favicon_driver.h"
#import "components/google/core/common/google_util.h"
#import "ios/chrome/browser/reading_list/model/favicon_web_state_dispatcher_impl.h"
#import "ios/chrome/browser/shared/ui/util/uikit_ui_util.h"
#import "ios/web/public/js_messaging/web_frame.h"
#import "ios/web/public/js_messaging/web_frames_manager.h"
#import "ios/web/public/navigation/navigation_item.h"
#import "ios/web/public/navigation/navigation_manager.h"
#import "ios/web/public/security/ssl_status.h"
#import "ios/web/public/web_state.h"
#import "net/base/apple/url_conversions.h"
#import "net/cert/cert_status_flags.h"
#import "url/url_constants.h"

namespace {
// The delay given to the web page to render after the PageLoaded callback.
constexpr base::TimeDelta kPageLoadDelay = base::Seconds(2);

// This script retrieve the href parameter of the <link rel="amphtml"> element
// of the page if it exists. If it does not exist, it returns the src of the
// first iframe of the page.
const char16_t* kGetIframeURLJavaScript =
    u"(() => {"
    "  var link = document.evaluate('//link[@rel=\"amphtml\"]',"
    "                               document,"
    "                               null,"
    "                               XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,"
    "                               null ).snapshotItem(0);"
    "  if (link !== null) {"
    "    return link.getAttribute('href');"
    "  }"
    "  return document.getElementsByTagName('iframe')[0].src;"
    "})()";

const char16_t* kWikipediaWorkaround =
    u"(() => {"
    "  var s = document.createElement('style');"
    "  s.innerHTML='.client-js .collapsible-block { display: block }';"
    "  document.head.appendChild(s);"
    "})()";
}  // namespace

namespace reading_list {

ReadingListDistillerPageDelegate::ReadingListDistillerPageDelegate() {}
ReadingListDistillerPageDelegate::~ReadingListDistillerPageDelegate() {}

ReadingListDistillerPage::ReadingListDistillerPage(
    const GURL& url,
    web::BrowserState* browser_state,
    FaviconWebStateDispatcher* web_state_dispatcher,
    ReadingListDistillerPageDelegate* delegate)
    : dom_distiller::DistillerPageIOS(browser_state),
      original_url_(url),
      web_state_dispatcher_(web_state_dispatcher),
      delegate_(delegate),
      delayed_task_id_(0),
      weak_ptr_factory_(this) {
  DCHECK(delegate);
}

ReadingListDistillerPage::~ReadingListDistillerPage() {}

void ReadingListDistillerPage::DistillPageImpl(const GURL& url,
                                               const std::string& script) {
  std::unique_ptr<web::WebState> old_web_state = DetachWebState();
  if (old_web_state) {
    web_state_dispatcher_->ReturnWebState(std::move(old_web_state));
  }
  std::unique_ptr<web::WebState> new_web_state =
      web_state_dispatcher_->RequestWebState();
  AttachWebState(std::move(new_web_state));

  delayed_task_id_++;
  distilling_main_page_ = url == original_url_;
  FetchFavicon(url);

  DistillerPageIOS::DistillPageImpl(url, script);

  // WKWebView sets the document.hidden property to true and the
  // document.visibilityState to prerender if the page is not added to a view
  // hierarchy. Some pages may not render their content in these conditions.
  // Add the view and move it out of the screen far in the top left corner of
  // the coordinate space.
  CGRect frame = [GetAnyKeyWindow() frame];
  frame.origin.x = -5 * std::max(frame.size.width, frame.size.height);
  frame.origin.y = frame.origin.x;
  DCHECK(![CurrentWebState()->GetView() superview]);
  [CurrentWebState()->GetView() setFrame:frame];
  [GetAnyKeyWindow() insertSubview:CurrentWebState()->GetView() atIndex:0];
}

void ReadingListDistillerPage::FetchFavicon(const GURL& page_url) {
  if (!CurrentWebState() || !page_url.is_valid()) {
    return;
  }
  favicon::WebFaviconDriver* favicon_driver =
      favicon::WebFaviconDriver::FromWebState(CurrentWebState());
  DCHECK(favicon_driver);
  favicon_driver->FetchFavicon(page_url, /*is_same_document=*/false);
}

void ReadingListDistillerPage::OnDistillationDone(const GURL& page_url,
                                                  const base::Value* value) {
  std::unique_ptr<web::WebState> old_web_state = DetachWebState();
  if (old_web_state) {
    [old_web_state->GetView() removeFromSuperview];
    web_state_dispatcher_->ReturnWebState(std::move(old_web_state));
  }
  delayed_task_id_++;
  DistillerPageIOS::OnDistillationDone(page_url, value);
}

bool ReadingListDistillerPage::IsLoadingSuccess(
    web::PageLoadCompletionStatus load_completion_status) {
  if (load_completion_status != web::PageLoadCompletionStatus::SUCCESS) {
    return false;
  }
  if (!CurrentWebState() || !CurrentWebState()->GetNavigationManager() ||
      !CurrentWebState()->GetNavigationManager()->GetLastCommittedItem()) {
    // Only distill fully loaded, committed pages. If the page was not fully
    // loaded, web::PageLoadCompletionStatus::FAILURE should have been passed to
    // OnLoadURLDone. But check that the item exist before using it anyway.
    return false;
  }
  web::NavigationItem* item =
      CurrentWebState()->GetNavigationManager()->GetLastCommittedItem();
  if (!item->GetURL().SchemeIsCryptographic()) {
    // HTTP is allowed.
    return true;
  }

  // On SSL connections, check there was no error.
  const web::SSLStatus& ssl_status = item->GetSSL();
  if (net::IsCertStatusError(ssl_status.cert_status)) {
    return false;
  }
  return true;
}

void ReadingListDistillerPage::OnLoadURLDone(
    web::PageLoadCompletionStatus load_completion_status) {
  if (!IsLoadingSuccess(load_completion_status)) {
    DistillerPageIOS::OnLoadURLDone(load_completion_status);
    return;
  }
  if (distilling_main_page_) {
    delegate_->DistilledPageHasMimeType(
        original_url_, CurrentWebState()->GetContentsMimeType());
  }
  if (!CurrentWebState()->ContentIsHTML()) {
    // If content is not HTML, distillation will fail immediately.
    // Call the handler to make sure cleaning methods are called correctly.
    // There is no need to wait for rendering either.
    DistillerPageIOS::OnLoadURLDone(load_completion_status);
    return;
  }
  FetchFavicon(CurrentWebState()->GetVisibleURL());

  // Page is loaded but rendering may not be done yet. Give a delay to the page.
  base::WeakPtr<ReadingListDistillerPage> weak_this =
      weak_ptr_factory_.GetWeakPtr();
  base::SingleThreadTaskRunner::GetCurrentDefault()->PostDelayedTask(
      FROM_HERE,
      base::BindOnce(&ReadingListDistillerPage::DelayedOnLoadURLDone, weak_this,
                     delayed_task_id_),
      kPageLoadDelay);
}

void ReadingListDistillerPage::DelayedOnLoadURLDone(int delayed_task_id) {
  if (!CurrentWebState() || delayed_task_id != delayed_task_id_) {
    // Something interrupted the distillation.
    // Abort here.
    return;
  }
  if (IsGoogleCachedAMPPage()) {
    // Workaround for Google AMP pages.
    HandleGoogleCachedAMPPage();
    return;
  }
  if (IsWikipediaPage()) {
    // Workaround for Wikipedia pages.
    // TODO(crbug.com/40485232): remove workaround once DOM distiller handle
    // this case.
    HandleWikipediaPage();
    return;
  }
  ContinuePageDistillation();
}

void ReadingListDistillerPage::ContinuePageDistillation() {
  if (!CurrentWebState()) {
    // Something interrupted the distillation.
    // Abort here.
    return;
  }
  // The page is ready to be distilled.
  // If the visible URL is not the original URL, notify the caller that URL
  // changed.
  GURL redirected_url = CurrentWebState()->GetVisibleURL();
  if (redirected_url != original_url_ && delegate_ && distilling_main_page_) {
    delegate_->DistilledPageRedirectedToURL(original_url_, redirected_url);
  }
  DistillerPageIOS::OnLoadURLDone(web::PageLoadCompletionStatus::SUCCESS);
}

bool ReadingListDistillerPage::IsGoogleCachedAMPPage() {
  // All google AMP pages have URL in the form "https://google_domain/amp/..."
  // and a valid certificate.
  // This method checks that this is strictly the case.
  const GURL& url = CurrentWebState()->GetLastCommittedURL();
  if (!url.is_valid() || !url.SchemeIs(url::kHttpsScheme)) {
    return false;
  }
  if (!google_util::IsGoogleDomainUrl(
          url, google_util::DISALLOW_SUBDOMAIN,
          google_util::DISALLOW_NON_STANDARD_PORTS) ||
      !url.path().compare(0, 4, "amp/")) {
    return false;
  }
  const web::SSLStatus& ssl_status = CurrentWebState()
                                         ->GetNavigationManager()
                                         ->GetLastCommittedItem()
                                         ->GetSSL();
  if (!ssl_status.certificate ||
      net::IsCertStatusError(ssl_status.cert_status)) {
    return false;
  }

  return true;
}

void ReadingListDistillerPage::HandleGoogleCachedAMPPage() {
  web::WebState* web_state = CurrentWebState();
  if (!web_state) {
    return;
  }
  web::WebFrame* web_frame =
      web_state->GetPageWorldWebFramesManager()->GetMainWebFrame();
  if (!web_frame) {
    return;
  }
  web_frame->ExecuteJavaScript(
      kGetIframeURLJavaScript,
      base::BindOnce(
          &ReadingListDistillerPage::OnHandleGoogleCachedAMPPageResult,
          weak_ptr_factory_.GetWeakPtr()));
}

void ReadingListDistillerPage::OnHandleGoogleCachedAMPPageResult(
    const base::Value* value,
    NSError* error) {
  if (!error && value->is_string()) {
    GURL new_gurl(value->GetString());
    if (new_gurl.is_valid()) {
      FetchFavicon(new_gurl);
      web::NavigationManager::WebLoadParams params(new_gurl);
      CurrentWebState()->GetNavigationManager()->LoadURLWithParams(params);

      // If there is no error, the navigation completion will
      // trigger a new `OnLoadURLDone` call that will resume
      // the distillation.
      return;
    }
  }

  // If there is an error on navigation, continue
  // normal distillation.
  ContinuePageDistillation();
}

bool ReadingListDistillerPage::IsWikipediaPage() {
  // All wikipedia pages are in the form "https://xxx.m.wikipedia.org/..."
  const GURL& url = CurrentWebState()->GetLastCommittedURL();
  if (!url.is_valid() || !url.SchemeIs(url::kHttpsScheme)) {
    return false;
  }
  return (base::EndsWith(url.host(), ".m.wikipedia.org",
                         base::CompareCase::SENSITIVE));
}

void ReadingListDistillerPage::HandleWikipediaPage() {
  web::WebState* web_state = CurrentWebState();
  if (!web_state) {
    return;
  }
  web::WebFrame* web_frame =
      web_state->GetPageWorldWebFramesManager()->GetMainWebFrame();
  if (!web_frame) {
    return;
  }
  web_frame->ExecuteJavaScript(
      kWikipediaWorkaround,
      BindOnce(&ReadingListDistillerPage::OnHandleWikipediaPageResult,
               weak_ptr_factory_.GetWeakPtr()));
}

void ReadingListDistillerPage::OnHandleWikipediaPageResult(
    const base::Value* value) {
  ContinuePageDistillation();
}

}  // namespace reading_list