chrome/renderer/safe_browsing/phishing_classifier.cc

   1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "chrome/renderer/safe_browsing/phishing_classifier.h"
   6
   7 #include <string>
   8
   9 #include "base/bind.h"
  10 #include "base/callback.h"
  11 #include "base/compiler_specific.h"
  12 #include "base/logging.h"
  13 #include "base/message_loop.h"
  14 #include "base/metrics/histogram.h"
  15 #include "base/string_util.h"
  16 #include "chrome/common/safe_browsing/csd.pb.h"
  17 #include "chrome/common/url_constants.h"
  18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
  19 #include "chrome/renderer/safe_browsing/features.h"
  20 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
  21 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
  22 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
  23 #include "chrome/renderer/safe_browsing/scorer.h"
  24 #include "content/public/renderer/render_view.h"
  25 #include "crypto/sha2.h"
  26 #include "googleurl/src/gurl.h"
  27 #include "third_party/WebKit/Source/WebKit/chromium/public/WebDataSource.h"
  28 #include "third_party/WebKit/Source/WebKit/chromium/public/WebDocument.h"
  29 #include "third_party/WebKit/Source/WebKit/chromium/public/WebFrame.h"
  30 #include "third_party/WebKit/Source/WebKit/chromium/public/platform/WebURL.h"
  31 #include "third_party/WebKit/Source/WebKit/chromium/public/platform/WebURLRequest.h"
  32 #include "third_party/WebKit/Source/WebKit/chromium/public/WebView.h"
  33
  34 namespace safe_browsing {
  35
  36 const float PhishingClassifier::kInvalidScore = -1.0;
  37 const float PhishingClassifier::kPhishyThreshold = 0.5;
  38
  39 PhishingClassifier::PhishingClassifier(content::RenderView* render_view,
  40                                        FeatureExtractorClock* clock)
  41     : render_view_(render_view),
  42       scorer_(NULL),
  43       clock_(clock),
  44       ALLOW_THIS_IN_INITIALIZER_LIST(weak_factory_(this)) {
  45   Clear();
  46 }
  47
  48 PhishingClassifier::~PhishingClassifier() {
  49   // The RenderView should have called CancelPendingClassification() before
  50   // we are destroyed.
  51   CheckNoPendingClassification();
  52 }
  53
  54 void PhishingClassifier::set_phishing_scorer(const Scorer* scorer) {
  55   CheckNoPendingClassification();
  56   scorer_ = scorer;
  57   if (scorer_) {
  58     url_extractor_.reset(new PhishingUrlFeatureExtractor);
  59     dom_extractor_.reset(
  60         new PhishingDOMFeatureExtractor(render_view_, clock_.get()));
  61     term_extractor_.reset(new PhishingTermFeatureExtractor(
  62         &scorer_->page_terms(),
  63         &scorer_->page_words(),
  64         scorer_->max_words_per_term(),
  65         scorer_->murmurhash3_seed(),
  66         clock_.get()));
  67   } else {
  68     // We're disabling client-side phishing detection, so tear down all
  69     // of the relevant objects.
  70     url_extractor_.reset();
  71     dom_extractor_.reset();
  72     term_extractor_.reset();
  73   }
  74 }
  75
  76 bool PhishingClassifier::is_ready() const {
  77   return scorer_ != NULL;
  78 }
  79
  80 void PhishingClassifier::BeginClassification(
  81     const string16* page_text,
  82     const DoneCallback& done_callback) {
  83   DCHECK(is_ready());
  84
  85   // The RenderView should have called CancelPendingClassification() before
  86   // starting a new classification, so DCHECK this.
  87   CheckNoPendingClassification();
  88   // However, in an opt build, we will go ahead and clean up the pending
  89   // classification so that we can start in a known state.
  90   CancelPendingClassification();
  91
  92   page_text_ = page_text;
  93   done_callback_ = done_callback;
  94
  95   // For consistency, we always want to invoke the DoneCallback
  96   // asynchronously, rather than directly from this method.  To ensure that
  97   // this is the case, post a task to begin feature extraction on the next
  98   // iteration of the message loop.
  99   MessageLoop::current()->PostTask(
 100       FROM_HERE,
 101       base::Bind(&PhishingClassifier::BeginFeatureExtraction,
 102                  weak_factory_.GetWeakPtr()));
 103 }
 104
 105 void PhishingClassifier::BeginFeatureExtraction() {
 106   WebKit::WebView* web_view = render_view_->GetWebView();
 107   if (!web_view) {
 108     RunFailureCallback();
 109     return;
 110   }
 111
 112   WebKit::WebFrame* frame = web_view->mainFrame();
 113   if (!frame) {
 114     RunFailureCallback();
 115     return;
 116   }
 117
 118   // Check whether the URL is one that we should classify.
 119   // Currently, we only classify http: URLs that are GET requests.
 120   GURL url(frame->document().url());
 121   if (!url.SchemeIs(chrome::kHttpScheme)) {
 122     RunFailureCallback();
 123     return;
 124   }
 125
 126   WebKit::WebDataSource* ds = frame->dataSource();
 127   if (!ds || !EqualsASCII(ds->request().httpMethod(), "GET")) {
 128     RunFailureCallback();
 129     return;
 130   }
 131
 132   features_.reset(new FeatureMap);
 133   if (!url_extractor_->ExtractFeatures(url, features_.get())) {
 134     RunFailureCallback();
 135     return;
 136   }
 137
 138   // DOM feature extraction can take awhile, so it runs asynchronously
 139   // in several chunks of work and invokes the callback when finished.
 140   dom_extractor_->ExtractFeatures(
 141       features_.get(),
 142       base::Bind(&PhishingClassifier::DOMExtractionFinished,
 143                  base::Unretained(this)));
 144 }
 145
 146 void PhishingClassifier::CancelPendingClassification() {
 147   // Note that cancelling the feature extractors is simply a no-op if they
 148   // were not running.
 149   DCHECK(is_ready());
 150   dom_extractor_->CancelPendingExtraction();
 151   term_extractor_->CancelPendingExtraction();
 152   weak_factory_.InvalidateWeakPtrs();
 153   Clear();
 154 }
 155
 156 void PhishingClassifier::DOMExtractionFinished(bool success) {
 157   if (success) {
 158     // Term feature extraction can take awhile, so it runs asynchronously
 159     // in several chunks of work and invokes the callback when finished.
 160     term_extractor_->ExtractFeatures(
 161         page_text_,
 162         features_.get(),
 163         base::Bind(&PhishingClassifier::TermExtractionFinished,
 164                    base::Unretained(this)));
 165   } else {
 166     RunFailureCallback();
 167   }
 168 }
 169
 170 void PhishingClassifier::TermExtractionFinished(bool success) {
 171   if (success) {
 172     WebKit::WebView* web_view = render_view_->GetWebView();
 173     if (!web_view) {
 174       RunFailureCallback();
 175       return;
 176     }
 177     WebKit::WebFrame* main_frame = web_view->mainFrame();
 178     if (!main_frame) {
 179       RunFailureCallback();
 180       return;
 181     }
 182
 183     // Hash all of the features so that they match the model, then compute
 184     // the score.
 185     FeatureMap hashed_features;
 186     ClientPhishingRequest verdict;
 187     verdict.set_model_version(scorer_->model_version());
 188     verdict.set_url(main_frame->document().url().spec());
 189     for (base::hash_map<std::string, double>::const_iterator it =
 190              features_->features().begin();
 191          it != features_->features().end(); ++it) {
 192       VLOG(2) << "Feature: " << it->first << " = " << it->second;
 193       bool result = hashed_features.AddRealFeature(
 194           crypto::SHA256HashString(it->first), it->second);
 195       DCHECK(result);
 196       ClientPhishingRequest::Feature* feature = verdict.add_feature_map();
 197       feature->set_name(it->first);
 198       feature->set_value(it->second);
 199     }
 200     float score = static_cast<float>(scorer_->ComputeScore(hashed_features));
 201     verdict.set_client_score(score);
 202     verdict.set_is_phishing(score >= kPhishyThreshold);
 203     RunCallback(verdict);
 204   } else {
 205     RunFailureCallback();
 206   }
 207 }
 208
 209 void PhishingClassifier::CheckNoPendingClassification() {
 210   DCHECK(done_callback_.is_null());
 211   DCHECK(!page_text_);
 212   if (!done_callback_.is_null() || page_text_) {
 213     LOG(ERROR) << "Classification in progress, missing call to "
 214                << "CancelPendingClassification";
 215     UMA_HISTOGRAM_COUNTS("SBClientPhishing.CheckNoPendingClassificationFailed",
 216                          1);
 217   }
 218 }
 219
 220 void PhishingClassifier::RunCallback(const ClientPhishingRequest& verdict) {
 221   done_callback_.Run(verdict);
 222   Clear();
 223 }
 224
 225 void PhishingClassifier::RunFailureCallback() {
 226   ClientPhishingRequest verdict;
 227   // In this case we're not guaranteed to have a valid URL.  Just set it
 228   // to the empty string to make sure we have a valid protocol buffer.
 229   verdict.set_url("");
 230   verdict.set_client_score(kInvalidScore);
 231   verdict.set_is_phishing(false);
 232   RunCallback(verdict);
 233 }
 234
 235 void PhishingClassifier::Clear() {
 236   page_text_ = NULL;
 237   done_callback_.Reset();
 238   features_.reset(NULL);
 239 }
 240
 241 }  // namespace safe_browsing