Fixes option-right in textfields when VoiceOver is set to read to the right of the...
[chromium-blink-merge.git] / chrome / renderer / safe_browsing / features.h
blob a8067f7e01cb33ff55a98dfae2414306d7f89083
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 //
5 // Common types and constants for extracting and evaluating features in the
6 // client-side phishing detection model. A feature is simply a string and an
7 // associated floating-point value between 0 and 1. The phishing
8 // classification model contains rules which give an appropriate weight to each
9 // feature or combination of features. These values can then be summed to
10 // compute a final phishiness score.
12 // Some features are boolean features. If these features are set, they always
13 // have a value of 0.0 or 1.0. In practice, the features are only set if the
14 // value is true (1.0).
16 // We also use token features. These features have a unique name that is
17 // constructed from the URL or page contents that we are classifying, for
18 // example, "UrlDomain=chromium". These features are also always set to 1.0
19 // if they are present.
21 // The intermediate storage of the features for a URL is a FeatureMap, which is
22 // just a thin wrapper around a map of feature name to value. The entire set
23 // of features for a URL is extracted before we do any scoring.
25 #ifndef CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
26 #define CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
27 #pragma once
29 #include <string>
30 #include "base/basictypes.h"
31 #include "base/hash_tables.h"
33 namespace safe_browsing {
35 // Container for a map of features to values, which enforces behavior
36 // such as a maximum number of features in the map.
37 class FeatureMap {
38 public:
39 FeatureMap();
40 ~FeatureMap();
42 // Adds a boolean feature to a FeatureMap with a value of 1.0.
43 // Returns true on success, or false if the feature map exceeds
44 // kMaxFeatureMapSize.
45 bool AddBooleanFeature(const std::string& name);
47 // Adds a real-valued feature to a FeatureMap with the given value.
48 // Values must always be in the range [0.0, 1.0]. Returns true on
49 // success, or false if the feature map exceeds kMaxFeatureMapSize
50 // or the value is outside of the allowed range.
51 bool AddRealFeature(const std::string& name, double value);
53 // Provides read-only access to the current set of features.
54 const base::hash_map<std::string, double>& features() const {
55 return features_;
58 // Clears the set of features in the map.
59 void Clear();
61 // This is an upper bound on the number of features that will be extracted.
62 // We should never hit this cap; it is intended as a sanity check to prevent
63 // the FeatureMap from growing too large.
64 static const size_t kMaxFeatureMapSize;
66 private:
67 base::hash_map<std::string, double> features_;
69 DISALLOW_COPY_AND_ASSIGN(FeatureMap);
namespace features {
// Constants for the various feature names that we use.  Each name is only
// declared here (extern); the definitions are not part of this header.
//
// IMPORTANT: when adding new features, you must update kAllowedFeatures in
// chrome/browser/safe_browsing/client_side_detection_service.cc if the feature
// should be sent in sanitized pingbacks.

////////////////////////////////////////////////////
// URL host features
////////////////////////////////////////////////////

// Set if the URL's hostname is an IP address.
extern const char kUrlHostIsIpAddress[];
// Token feature containing the portion of the hostname controlled by a
// registrar, for example "com" or "co.uk".
extern const char kUrlTldToken[];
// Token feature containing the first host component below the registrar.
// For example, in "www.google.com", the domain would be "google".
extern const char kUrlDomainToken[];
// Token feature containing each host component below the domain.
// For example, in "www.host.example.com", both "www" and "host" would be
// "other host tokens".
extern const char kUrlOtherHostToken[];

////////////////////////////////////////////////////
// Aggregate features for URL host tokens
////////////////////////////////////////////////////

// Set if the number of "other" host tokens for a URL is greater than one.
// Longer hostnames, regardless of the specific tokens, can be a signal that
// the URL is phishy.
extern const char kUrlNumOtherHostTokensGTOne[];
// Set if the number of "other" host tokens for a URL is greater than three.
extern const char kUrlNumOtherHostTokensGTThree[];

////////////////////////////////////////////////////
// URL path token features
////////////////////////////////////////////////////

// Token feature containing each alphanumeric string in the path that is at
// least 3 characters long.  For example, "/abc/d/efg" would have 2 path
// token features, "abc" and "efg".  Query parameters are not included.
extern const char kUrlPathToken[];

////////////////////////////////////////////////////
// DOM HTML form features
////////////////////////////////////////////////////

// Set if the page has any <form> elements.
extern const char kPageHasForms[];
// The fraction of form elements whose |action| attribute points to a
// URL on a different domain from the document URL.
extern const char kPageActionOtherDomainFreq[];

// Set if the page has any <input type="text"> elements
// (includes inputs with missing or unknown types).
extern const char kPageHasTextInputs[];
// Set if the page has any <input type="password"> elements.
extern const char kPageHasPswdInputs[];
// Set if the page has any <input type="radio"> elements.
extern const char kPageHasRadioInputs[];
// Set if the page has any <input type="checkbox"> elements.
extern const char kPageHasCheckInputs[];

////////////////////////////////////////////////////
// DOM HTML link features
////////////////////////////////////////////////////

// The fraction of links in the page which point to a domain other than the
// domain of the document.  See "URL host features" above for a discussion
// of how the domain is computed.
extern const char kPageExternalLinksFreq[];
// Token feature containing each external domain that is linked to.
extern const char kPageLinkDomain[];
// Fraction of links in the page that use https.
extern const char kPageSecureLinksFreq[];

////////////////////////////////////////////////////
// DOM HTML script features
////////////////////////////////////////////////////

// Set if the number of <script> elements in the page is greater than 1.
extern const char kPageNumScriptTagsGTOne[];
// Set if the number of <script> elements in the page is greater than 6.
extern const char kPageNumScriptTagsGTSix[];

////////////////////////////////////////////////////
// Other DOM HTML features
////////////////////////////////////////////////////

// The fraction of images whose src attribute points to an external domain.
extern const char kPageImgOtherDomainFreq[];

////////////////////////////////////////////////////
// Page term features
////////////////////////////////////////////////////

// Token feature for a term (whitespace-delimited) on a page.  Terms can be
// single words or multi-word n-grams.  Rather than adding this feature for
// every possible token on a page, only the terms that are mentioned in the
// classification model are added.
extern const char kPageTerm[];

}  // namespace features
176 } // namespace safe_browsing
178 #endif // CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_