// hphp/util/extern-worker.h
/*
   +----------------------------------------------------------------------+
   | HipHop for PHP                                                       |
   +----------------------------------------------------------------------+
   | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com)  |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/

#pragma once
#include "hphp/util/coro.h"
#include "hphp/util/optional.h"
#include "hphp/util/trace.h"

#include <filesystem>
#include <string>
#include <vector>

#include <boost/variant.hpp>
/*
 * Framework for executing work outside of the HHVM process.
 *
 * The main use case for this framework is to provide a mechanism to
 * execute work in a distributed manner on remote machines. This can
 * allow for greater parallelism than just running locally, and allow
 * for datasets which exceed the memory available on any one
 * machine. However, it has been designed to be agnostic (as much as
 * possible) as to the exact mechanism used for execution, meaning it
 * can be expanded for other use cases we foresee in the future.
 *
 * Terminology:
 *
 * "Job" - Encapsulates some piece of work. A Job has a typed set of
 * inputs, and produces a typed set of outputs. Multiple instances of
 * the same job can be run in one "execution" (with different
 * inputs). A job also has a separate set of "init" inputs, which are
 * processed once per execution (used for global initialization).
 * Exactly where/how the job "executes" is not specified (it depends
 * on the implementation and config), but it will be outside of the
 * process. Some implementations may cache the results of an
 * execution, meaning they can produce the output without having to
 * actually run the job.
 *
 * "Ref" - Refs represent some piece of data. You do not provide
 * inputs or read outputs from a job directly. Instead these are
 * represented by refs to the data. You "store" a piece of data to
 * obtain its ref (which can then be provided to a job
 * execution). Likewise, given a ref, you can "load" it to obtain the
 * data. This lets you feed data from one job to the next without
 * having to explicitly load it. Refs are type-safe. They know which
 * type of data they point to, meaning you know (at compile time) that
 * you're passing the right kind of data to a job. Refs contain a
 * "RefId", which is a string/size pair. This uniquely identifies the
 * data, but the meaning of the pair is up to the implementation. Any
 * data stored needs to be serializable with BlobEncoder/BlobDecoder.
 *
 * "Markers" - In some situations, you want to be able to represent a
 * variadic list of data, or an optional piece of data. You could use
 * std::vector or Optional for this, but you would get a
 * Ref<std::vector>, which means a ref to a std::vector. It might be
 * more useful to have a std::vector<Ref>, which means you know how
 * many refs were produced without having to load everything. A
 * special set of "marker" types can be used for this. They're
 * "Variadic", "Opt", and "Multi", which have similar meanings to
 * std::vector, Optional, and std::tuple, but are new types to avoid
 * ambiguity. These can only be used as inputs/outputs of jobs. Multi
 * can only be used as a return type.
 *
 * "Client" - Represents an instance of an extern-worker
 * framework. Responsible for producing/consuming refs, and executing
 * jobs. A client is backed by a particular implementation, which does
 * the actual work. The default implementation uses fork+exec, but
 * others can be provided by a hook mechanism.
 *
 * To use the extern-worker framework, you must provide a handler in
 * your main() function. If passed extern_worker::s_option as argv[1],
 * call extern_worker::main(), passing in the argv and argc.
 */
//////////////////////////////////////////////////////////////////////

// Implementation details to avoid cluttering the interface
#define incl_HPHP_EXTERN_WORKER_DETAIL_H_
#include "hphp/util/extern-worker-detail.h"
#undef incl_HPHP_EXTERN_WORKER_DETAIL_H_

//////////////////////////////////////////////////////////////////////
namespace HPHP::extern_worker {

//////////////////////////////////////////////////////////////////////

extern const char* const s_option;

// Entry point for workers
extern int main(int argc, char** argv);

//////////////////////////////////////////////////////////////////////

// Thrown by any of the extern-worker functions to indicate an error
struct Error : public std::runtime_error {
  using std::runtime_error::runtime_error;
};

// Thrown by some implementations if the backend is busy. Depending on
// configuration, we might retry the action automatically.
struct Throttle : public Error {
  using Error::Error;
};

//////////////////////////////////////////////////////////////////////
/*
 * Represents a job which can be executed. The implementation of the
 * job is provided by a separate class, which Job is instantiated on.
 * That class is meant to provide 4 static member functions:
 *
 * std::string name() - Returns the name of the job. There are no
 * restrictions on the name, but it must be globally unique across all
 * Jobs in the executable.
 *
 * void init(<inputs>) - Called once per worker invocation. Used to
 * set up global state before run() is called.
 *
 * <outputs> run(<inputs>) - Run the job. This may be called multiple
 * times per worker invocation (with different inputs). init() will
 * only be called once beforehand.
 *
 * void fini() - Called once when the worker is cleaning up. Meant to
 * tear down any global state.
 *
 * The inputs of init() and the inputs/outputs of run() can be any set
 * of types which are blob serializable/deserializable, in addition to
 * the special marker types (described below). run() normally returns
 * one type. If you want to return multiple types, use Multi<>.
 *
 * To create a Job, instantiate it with the appropriate class and
 * declare a static instance of it. The Job *must* have static
 * lifetime.
 */
template <typename C>
struct Job : public detail::JobBase {
  Job();

  // The inferred Ref types from the declared inputs/outputs of C's
  // member functions.
  using ConfigT = typename detail::ConfigRefs<C>::type;
  using InputsT = typename detail::InputRefs<C>::type;
  using ReturnT = typename detail::ReturnRefs<C>::type;
  using FiniT   = typename detail::FiniRefs<C>::type;
  using ExecT   = typename detail::ExecRet<C>::type;

private:
  void init(const std::filesystem::path&) const override;
  void fini(const std::filesystem::path&) const override;
  void run(const std::filesystem::path&,
           const std::filesystem::path&) const override;
};
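As a sketch (all names here are hypothetical, not part of this header), a class providing the four static member functions described above, plus its static Job instance, might look like:

```cpp
// Hypothetical job: counts words in a string. The class itself knows
// nothing about Refs; the framework infers the Ref shapes from the
// init()/run() signatures.
struct WordCount {
  static std::string name() { return "word-count"; }
  // Called once per worker invocation, with the "config" inputs.
  static void init(int verbosity) { s_verbosity = verbosity; }
  // Called once per set of inputs in the execution.
  static size_t run(std::string text) {
    size_t words = 0;
    bool inWord = false;
    for (char c : text) {
      if (c == ' ' || c == '\n') inWord = false;
      else if (!inWord) { inWord = true; ++words; }
    }
    return words;
  }
  // Called once when the worker is cleaning up.
  static void fini() {}
  static int s_verbosity;
};
int WordCount::s_verbosity = 0;

// The Job *must* have static lifetime:
static Job<WordCount> s_wordCountJob;
```

This sketch assumes blob serialization support for the types involved; it is not compilable outside the HHVM tree.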
//////////////////////////////////////////////////////////////////////

// "Marker" types. These are used to control how types are mapped to
// Refs. Useful, for example, for returning a vector of Refs instead
// of a Ref of a vector. Note that marker types never appear within a
// Ref.

// By default: T -> Ref<T>

template <typename T>
struct Variadic {
  // Variadic<T> -> std::vector<Ref<T>>
  using Type = T;
  std::vector<T> vals;
};

template <typename T>
struct Opt {
  // Opt<T> -> Optional<Ref<T>>
  using Type = T;
  Optional<T> val;
};

// Multi is only valid as a return type.
template <typename... Ts>
struct Multi {
  // Multi<T1, T2, ...> -> std::tuple<Ref<T1>, Ref<T2>, ...>
  template <typename... Us> /* implicit */ Multi(std::tuple<Us...> t)
    : vals{std::move(t)} {}
  std::tuple<Ts...> vals;
};
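To illustrate the mapping, here is a hypothetical job signature (illustrative only) using a marker type, with the resulting Ref shape noted in comments:

```cpp
// Hypothetical job splitting text into lines.
struct SplitLines {
  static std::string name() { return "split-lines"; }
  static void init() {}
  static void fini() {}
  // Variadic return: for each input, exec() yields a
  // std::vector<Ref<std::string>> (one Ref per line), so the caller
  // knows how many lines there are without loading anything.
  static Variadic<std::string> run(std::string text);
};
// By contrast, `static std::vector<std::string> run(std::string)`
// would yield a single Ref<std::vector<std::string>>, and
// `static Multi<std::string, Opt<int>> run(std::string)` would yield
// a std::tuple<Ref<std::string>, Optional<Ref<int>>>.
```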
//////////////////////////////////////////////////////////////////////

// Identifier for a Ref. Used by the implementation to track them. The
// meaning of the identifier is private to the implementation.
struct RefId {
  RefId(std::string, size_t);

  std::string toString() const;
  bool operator==(const RefId&) const;
  bool operator!=(const RefId&) const;

  // Despite their names, these fields can be used for anything.
  std::string m_id;
  size_t m_size;
};
// Represents a piece of data "inside" the extern-worker
// framework. The data may not even exist locally (it could be on disk
// or in the network). A Ref is basically a RefId plus the type of the
// data. Only Client can create Refs, so "type-punning" is
// impossible. A Ref is only usable with the Client that produced it
// and cannot outlive the Client.
template <typename T>
struct Ref {
  const RefId& id() const { return m_id; }
  // Whether this ref came from a "fallback" operation (see below with
  // Client). This is exposed mainly for testing. Users shouldn't
  // care.
  bool fromFallback() const { return m_fromFallback; }
private:
  Ref(RefId, bool);
  RefId m_id;
  bool m_fromFallback;
  friend struct Client;
};
//////////////////////////////////////////////////////////////////////

// This is meant for internal usage and is here (and not in detail) so
// implementations outside of these files can use it. It represents a
// particular operation and has some stuff for tracking time and
// TRACE.
struct RequestId {
  explicit RequestId(const char* type);
  ~RequestId();

  RequestId(const RequestId&) = delete;
  RequestId(RequestId&&) = default;
  RequestId& operator=(const RequestId&) = delete;
  RequestId& operator=(RequestId&&) = default;

  std::string tracePrefix() const;
  std::string toString() const;

private:
  uint64_t m_id;
  const char* m_type;
  Optional<detail::Timer> m_timer;

  static std::atomic<uint64_t> s_next;
  static std::atomic<uint64_t> s_active;

  TRACE_SET_MOD(extern_worker);
};
//////////////////////////////////////////////////////////////////////

// More stuff for the Client/Client::Impl interface, here out of
// convenience.
using IdVec = std::vector<RefId>;
// A "blob" is a string containing some arbitrary binary data
using BlobVec = std::vector<std::string>;
using PathVec = std::vector<std::filesystem::path>;

// These are used to describe inputs in a generic way to
// Client::Impl. An input can be a RefId, an optional RefId, or a
// vector of RefIds.
using RefVal = boost::variant<RefId, Optional<RefId>, IdVec>;
using RefValVec = std::vector<RefVal>;

// Likewise, these describe outputs to Client::Impl. We only need to
// represent the type since there's no id beforehand.
enum class OutputType { Val, Opt, Vec };
//////////////////////////////////////////////////////////////////////

// Configuration controlling the behavior of Client.
struct Options {
  // Whether to use the always available "subprocess"
  // implementation. This uses fork+exec (and stores data on disk).
  enum class UseSubprocess {
    Always,   // Always use subprocess
    Fallback, // Attempt to use another backend, but if not available,
              // use subprocess.
    Never     // Never use subprocess. Throw an error if nothing else
              // is available.
  };
  Options& setUseSubprocess(UseSubprocess u) {
    m_useSubprocess = u;
    return *this;
  }

  // The implementation may need to store data on disk (subprocess for
  // example). This is the location where such things are stored.
  Options& setWorkingDir(std::filesystem::path dir) {
    m_workingDir = std::move(dir);
    return *this;
  }

  // Timeout on job execution. Best effort; implementations may not
  // support it (subprocess does not).
  Options& setTimeout(std::chrono::seconds s) {
    m_timeout = s;
    return *this;
  }

  // Whether to log verbosely
  Options& setVerboseLogging(bool v) {
    m_verboseLogging = v;
    return *this;
  }

  // Whether to cache execution of jobs. Not all implementations cache
  // execution (subprocess does not), so this is a no-op on those.
  Options& setCacheExecs(bool c) {
    m_cacheExecs = c;
    return *this;
  }

  // The minimum TTL before a cache entry is refreshed. If the
  // implementation caches data, we'll consider the data as "not
  // present" if its TTL drops below this value. This allows us to
  // re-upload the value (and refresh its TTL) before it actually
  // expires.
  Options& setMinTTL(std::chrono::seconds s) {
    m_minTTL = s;
    return *this;
  }

  // Implementations which rely on hashing can use EdenFS to avoid
  // hashing the file. This controls that.
  Options& setUseEdenFS(bool u) {
    m_useEdenFS = u;
    return *this;
  }

  // Whether to clean up data stored on disk when the Client is
  // destroyed. This can take a very long time for lots of data (and
  // can hinder debugging), so it can be disabled.
  Options& setCleanup(bool c) {
    m_cleanup = c;
    return *this;
  }

  // Some implementations have a notion of "use-case". This provides
  // one, which can control whether those implementations are enabled.
  Options& setUseCase(std::string u) {
    m_useCase = std::move(u);
    return *this;
  }

  // If the backend is busy, retry the action this number of times (0
  // disables retrying).
  Options& setThrottleRetries(size_t r) {
    m_throttleRetries = r;
    return *this;
  }

  // Each time we retry because of throttling, we will wait up to
  // twice as long as the previous time. This is the amount of time we
  // wait the first time (so everything is scaled from it).
  Options& setThrottleBaseWait(std::chrono::milliseconds m) {
    m_throttleBaseWait = m;
    return *this;
  }

  // The below options are RE specific and not documented:
  Options& setUseRichClient(bool b) {
    m_useRichClient = b;
    return *this;
  }

  Options& setUseZippyRichClient(bool b) {
    m_useZippyRichClient = b;
    return *this;
  }

  Options& setUseP2P(bool b) {
    m_useP2P = b;
    return *this;
  }

  UseSubprocess m_useSubprocess{UseSubprocess::Fallback};
  std::filesystem::path m_workingDir{std::filesystem::temp_directory_path()};
  std::chrono::seconds m_timeout{std::chrono::minutes{15}};
  std::chrono::seconds m_minTTL{std::chrono::hours{3}};
  std::chrono::milliseconds m_throttleBaseWait{25};
  size_t m_throttleRetries{7};
  bool m_verboseLogging{false};
  bool m_cacheExecs{true};
  bool m_useEdenFS{true};
  bool m_cleanup{true};
  bool m_useRichClient{true};
  bool m_useZippyRichClient{false};
  bool m_useP2P{false};
  std::string m_useCase;
};
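The setters chain, so a configuration can be built in one expression. A hypothetical example (all values illustrative):

```cpp
// Build an Options with chained setters. The throttle settings mean:
// on a Throttle error, retry up to 5 times, waiting up to ~25ms the
// first time and up to twice the previous wait each time after.
Options opts = Options{}
  .setUseSubprocess(Options::UseSubprocess::Fallback)
  .setWorkingDir("/tmp/extern-worker")            // hypothetical path
  .setTimeout(std::chrono::minutes{30})
  .setCacheExecs(true)
  .setThrottleRetries(5)
  .setThrottleBaseWait(std::chrono::milliseconds{25});
```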
//////////////////////////////////////////////////////////////////////

// Encapsulates an instance of the extern-worker framework. This is
// responsible for producing/consuming Refs, and executing jobs with
// those Refs as inputs. The actual behavior of the Client (i.e. where
// it stores the data, and where the workers run) depends on the
// specific Client::Impl in use. An implementation which uses
// fork+exec (and stores data on disk) is always available and will be
// used if nothing else is available (if so requested in Options). The
// fork+exec implementation can also serve as a "fallback"
// implementation if the main implementation throws an error. The
// assumption is that the fork+exec implementation is more reliable
// than anything else. For the most part, this is invisible to the
// user (though it may cause performance degradation).

struct Client {
  // Create a new Client with the given set of Options and an
  // Executor. The executor will be used for any coro things if the
  // implementation requires it.
  explicit Client(folly::Executor::KeepAlive<>, const Options& = {});
  ~Client();

  // Return a descriptive string of the implementation currently in
  // use. Mainly for logging.
  const std::string& implName() const;
  // Return true if the implementation in use is the built-in
  // fork+exec implementation.
  bool usingSubprocess() const;
  // Return true if the implementation in use supports "optimistic"
  // storing.
  bool supportsOptimistic() const;

  // Whether we've fallen back (for at least one action) to the
  // built-in subprocess implementation (this is false if the
  // implementation was subprocess to begin with).
  bool fellback() const;

  // Loading. These take various permutations of Refs, load them,
  // deserialize the blobs into the appropriate types, and return the
  // data in a matching format. Using the variations which take
  // multiple Refs at once is more efficient than making multiple
  // calls.
  template <typename T> coro::Task<T> load(Ref<T>);

  template <typename T, typename... Ts>
  coro::Task<std::tuple<T, Ts...>> load(Ref<T>, Ref<Ts>...);

  template <typename T> coro::Task<std::vector<T>> load(std::vector<Ref<T>>);

  template <typename T, typename... Ts>
  coro::Task<std::vector<std::tuple<T, Ts...>>>
  load(std::vector<std::tuple<Ref<T>, Ref<Ts>...>>);

  // Storing files. These take either a path, or a vector of paths,
  // and upload the contents. This is semantically equivalent to
  // reading the file yourself and uploading the data as
  // blobs. However, it might be more efficient, as some
  // implementations can deal with on-disk files specially. Note that
  // the returned Refs are for strings, since you're uploading the
  // contents of the file. Optimistic mode (if supported) won't ever
  // actually store anything. It will just generate the Refs and
  // assume the data is already stored.
  coro::Task<Ref<std::string>> storeFile(std::filesystem::path,
                                         bool optimistic = false);

  coro::Task<std::vector<Ref<std::string>>>
  storeFile(std::vector<std::filesystem::path>,
            bool optimistic = false);

  // Storing blobs. These take various permutations of data, serialize
  // them (using BlobEncoder), store them however the implementation
  // does, and return the appropriate Refs for them. These have
  // different names to avoid ambiguities (do you want to upload a
  // single vector of T, or multiple Ts passed as a vector?).
  template <typename T> coro::Task<Ref<T>> store(T);

  template <typename T, typename... Ts>
  coro::Task<std::tuple<Ref<T>, Ref<Ts>...>> store(T, Ts...);

  template <typename T> coro::Task<Ref<T>> storeOptimistically(T);

  template <typename T, typename... Ts>
  coro::Task<std::tuple<Ref<T>, Ref<Ts>...>> storeOptimistically(T, Ts...);

  template <typename T>
  coro::Task<std::vector<Ref<T>>> storeMulti(std::vector<T>,
                                             bool optimistic = false);

  template <typename T, typename... Ts>
  coro::Task<std::vector<std::tuple<Ref<T>, Ref<Ts>...>>>
  storeMultiTuple(std::vector<std::tuple<T, Ts...>>,
                  bool optimistic = false);

  // Execute a job with the given sets of inputs (and any config setup
  // params). The outputs of those job executions will be returned as
  // a vector of Refs. The exact format of the inputs and outputs is
  // determined (at compile time) by the job being run and matches the
  // job's specification. If "optimistic" is set to true, then at
  // least one of the inputs was stored using the optimistic
  // flag. This means the inputs may not actually exist on the worker
  // side. If they don't, the execution will fail (by throwing an
  // exception), and the caller should (actually) store the data and
  // retry. The flag also disables automatic fallback.
  template <typename C> coro::Task<typename Job<C>::ExecT>
  exec(const Job<C>& job,
       typename Job<C>::ConfigT config,
       std::vector<typename Job<C>::InputsT> inputs,
       bool optimistic = false);
  // Statistics about the usage of this extern-worker.
  struct Stats {
    // Files whose contents were read from disk (on EdenFS we might
    // not have to actually read the file).
    std::atomic<size_t> filesRead{0};

    // Total number of files and blobs we "stored" (they might have
    // had to be uploaded).
    std::atomic<size_t> files{0};
    std::atomic<size_t> blobs{0};

    // Number of times we had to query the back-end if a file or blob
    // is present. Using "optimistic" uploading, we might be able to
    // skip checking.
    std::atomic<size_t> filesQueried{0};
    std::atomic<size_t> blobsQueried{0};

    // Number of files or blobs actually uploaded.
    std::atomic<size_t> filesUploaded{0};
    std::atomic<size_t> blobsUploaded{0};

    // Number of bytes for files or blobs actually uploaded.
    std::atomic<size_t> fileBytesUploaded{0};
    std::atomic<size_t> blobBytesUploaded{0};

    // Number of times we fell back when uploading a file or blob.
    std::atomic<size_t> fileFallbacks{0};
    std::atomic<size_t> blobFallbacks{0};

    // Number of blobs/bytes downloaded (because of a load call).
    std::atomic<size_t> downloads{0};
    std::atomic<size_t> bytesDownloaded{0};

    // Total number of execs attempted (per input).
    std::atomic<size_t> execs{0};
    // Execs which hit the result cache
    std::atomic<size_t> execCacheHits{0};
    // Execs which fell back
    std::atomic<size_t> execFallbacks{0};

    std::atomic<size_t> execCpuUsec{0};
    std::atomic<size_t> execAllocatedCores{0};
    std::atomic<size_t> execMaxUsedMem{0};
    std::atomic<size_t> execReservedMem{0};

    // Execs in optimistic mode which succeeded
    std::atomic<size_t> optimisticExecs{0};

    std::atomic<size_t> throttles{0};

    void reset() {
      filesRead.store(0);
      files.store(0);
      blobs.store(0);
      filesQueried.store(0);
      blobsQueried.store(0);
      filesUploaded.store(0);
      blobsUploaded.store(0);
      fileBytesUploaded.store(0);
      blobBytesUploaded.store(0);
      fileFallbacks.store(0);
      blobFallbacks.store(0);
      downloads.store(0);
      bytesDownloaded.store(0);
      execs.store(0);
      execCacheHits.store(0);
      execFallbacks.store(0);
      execCpuUsec.store(0);
      execAllocatedCores.store(0);
      execMaxUsedMem.store(0);
      execReservedMem.store(0);
      optimisticExecs.store(0);
      throttles.store(0);
    }
  };

  const Stats& getStats() const { return m_stats; }
  void resetStats() { m_stats.reset(); }
  // Synthetically force a fallback event when storing data or
  // executing a job, as if the implementation failed. This is for
  // tests to force the fallback path to be exercised. You don't need
  // this otherwise.
  void forceFallback() { m_forceFallback = true; }
  void unforceFallback() { m_forceFallback = false; }

  struct Impl;

private:
  std::unique_ptr<Impl> m_impl;
  LockFreeLazy<std::unique_ptr<Impl>> m_fallbackImpl;
  Options m_options;
  Stats m_stats;
  bool m_forceFallback;

  template <typename T> coro::Task<Ref<T>> storeImpl(bool, T);

  template <typename T, typename... Ts>
  coro::Task<std::tuple<Ref<T>, Ref<Ts>...>> storeImpl(bool, T, Ts...);

  template <typename T, typename F>
  coro::Task<T> tryWithThrottling(const F&);

  template <typename T, typename F>
  coro::Task<T> tryWithFallback(const F&, bool&, bool noFallback = false);

  template <typename T> static T unblobify(std::string&&);
  template <typename T> static std::string blobify(T&&);

  static const std::array<OutputType, 1> s_valOutputType;
  static const std::array<OutputType, 1> s_vecOutputType;
  static const std::array<OutputType, 1> s_optOutputType;

  std::unique_ptr<Impl> makeFallbackImpl();

  TRACE_SET_MOD(extern_worker);
};
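Putting the pieces together, the store -> exec -> load cycle for a hypothetical job type (illustrative names, error handling omitted) might look like:

```cpp
// Sketch of a full round-trip through a Client, assuming a
// hypothetical job class `WordCount` with `init(int)` and
// `size_t run(std::string)`. The exact tuple shapes are inferred at
// compile time from those signatures.
coro::Task<size_t> countWords(Client& client,
                              const Job<WordCount>& job,
                              std::string text) {
  // Store the input, obtaining a Ref to it.
  auto inputRef = co_await client.store(std::move(text));
  // Execute the job with one set of inputs. The config tuple feeds
  // WordCount::init(); each inputs tuple feeds one WordCount::run().
  auto outRefs = co_await client.exec(
    job,
    std::make_tuple(co_await client.store(int{1})),  // init() input
    { std::make_tuple(inputRef) }                    // one run() input set
  );
  // Load the output Ref for the single input set back into a value.
  co_return co_await client.load(outRefs[0]);
}
```

This is a sketch only; the precise element type of `outRefs` is `Job<WordCount>::ExecT`'s element type and depends on the job's declared outputs.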
//////////////////////////////////////////////////////////////////////

// Actual implementation of Client. Implementations of Client::Impl
// control how data is stored and where the work is actually
// executed. Client always provides a "subprocess" implementation
// which uses the disk and fork+exec.
struct Client::Impl {
  virtual ~Impl() = default;

  // Name of the implementation. Mainly for logging.
  const std::string& name() const { return m_name; }

  // Whether this is the special subprocess impl. It's treated
  // specially when it comes to falling back.
  virtual bool isSubprocess() const = 0;
  // Whether this impl supports optimistic uploading (or whether it's
  // profitable to do so).
  virtual bool supportsOptimistic() const = 0;
  // An implementation can declare itself "disabled" at any point (for
  // example, due to some internal error). After that point, either
  // Client will fail, or the fallback subprocess implementation will
  // be used instead (depending on config).
  virtual bool isDisabled() const = 0;

  // Load some number of RefIds, returning them as blobs (in the same
  // order as requested).
  virtual coro::Task<BlobVec> load(const RequestId& requestId,
                                   IdVec ids) = 0;
  // Store some number of files and/or blobs, returning their
  // associated RefIds (in the same order as requested, with files
  // before blobs).
  virtual coro::Task<IdVec> store(const RequestId& requestId,
                                  PathVec files,
                                  BlobVec blobs,
                                  bool optimistic) = 0;

  // Execute a job with the given sets of inputs. The job will be
  // executed on a worker, with the job's run function called once for
  // each set of inputs.
  virtual coro::Task<std::vector<RefValVec>>
  exec(const RequestId& requestId,
       const std::string& command,
       RefValVec config,
       std::vector<RefValVec> inputs,
       const folly::Range<const OutputType*>& output,
       const folly::Range<const OutputType*>* finiOutput) = 0;

protected:
  Impl(std::string name, Client& parent)
    : m_name{std::move(name)}
    , m_parent{parent} {}

  Client::Stats& stats() { return m_parent.m_stats; }

  template <typename T, typename F>
  static coro::Task<T> tryWithThrottling(size_t,
                                         std::chrono::milliseconds,
                                         std::atomic<size_t>&,
                                         const F&);

private:
  std::string m_name;
  Client& m_parent;

  static void throttleSleep(size_t, std::chrono::milliseconds);

  friend struct Client;
};
// If true, we're running inside a job.
extern thread_local bool g_in_job;

// Hook for providing an implementation. An implementation can set
// g_impl_hook to a function which optionally creates a Client::Impl.
using ImplHook =
  std::unique_ptr<Client::Impl>(*)(
    const Options&,
    folly::Executor::KeepAlive<>,
    Client&
  );
extern ImplHook g_impl_hook;
//////////////////////////////////////////////////////////////////////

// Maps a key to a Ref<V>, automatically storing the data if needed.
template <typename K, typename V>
struct RefCache {
  explicit RefCache(Client&);

  // Look up the associated Ref for the given key. If there's no
  // entry, store the given value (using the Client provided in the
  // ctor) and return the newly created Ref.
  coro::Task<Ref<V>> get(const K&, const V&, folly::Executor::KeepAlive<>);
private:
  coro::AsyncMap<K, Ref<V>> m_map;
  Client& m_client;
};
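A hypothetical use of RefCache (illustrative names): memoizing the stored Ref for values that are expensive to re-upload, keyed by name.

```cpp
// Sketch: store `contents` the first time `key` is seen; later calls
// with the same key return the cached Ref without re-storing.
coro::Task<Ref<std::string>>
getSource(RefCache<std::string, std::string>& cache,
          const std::string& key,
          const std::string& contents,
          folly::Executor::KeepAlive<> exec) {
  co_return co_await cache.get(key, contents, std::move(exec));
}
```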
//////////////////////////////////////////////////////////////////////

}

//////////////////////////////////////////////////////////////////////

#define incl_HPHP_EXTERN_WORKER_INL_H_
#include "hphp/util/extern-worker-inl.h"
#undef incl_HPHP_EXTERN_WORKER_INL_H_