rtkaio/sysdeps/unix/sysv/linux/kaio_misc.c

   1 /* Handle general operations.
   2    Copyright (C) 1997,1998,1999,2000,2001,2002,2003
   3    Free Software Foundation, Inc.
   4    This file is part of the GNU C Library.
   5    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
   6
   7    The GNU C Library is free software; you can redistribute it and/or
   8    modify it under the terms of the GNU Lesser General Public
   9    License as published by the Free Software Foundation; either
  10    version 2.1 of the License, or (at your option) any later version.
  11
  12    The GNU C Library is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    Lesser General Public License for more details.
  16
  17    You should have received a copy of the GNU Lesser General Public
  18    License along with the GNU C Library; if not, write to the Free
  19    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  20    02111-1307 USA.  */
  21
  22 #include <kaio_misc.h>
  23
  24 #ifndef USE_KAIO
  25 #include <aio_misc.c>
  26 #else
  27
  28 #include <aio.h>
  29 #include <assert.h>
  30 #include <errno.h>
  31 #include <limits.h>
  32 #include <pthread.h>
  33 #include <stdlib.h>
  34 #include <unistd.h>
  35 #include <sys/stat.h>
  36 #include <sys/time.h>
  37
  38 static void add_request_to_runlist (struct requestlist *newrequest)
  39         internal_function;
  40 static int add_request_to_list (struct requestlist *newrequest, int fildes,
  41                                 int prio)
  42         internal_function;
  43 static void * handle_kernel_aio (void *arg);
  44 static void kernel_callback (kctx_t ctx, struct kiocb *kiocb, long res,
  45                              long res2);
  46
  47 /* Pool of request list entries.  */
  48 static struct requestlist **pool;
  49
  50 /* Number of total and allocated pool entries.  */
  51 static size_t pool_max_size;
  52 static size_t pool_size;
  53
  54 /* Kernel AIO context.  */
  55 kctx_t __aio_kioctx = KCTX_NONE;
  56 int __have_no_kernel_aio;
  57 int __kernel_thread_started;
  58
  59 /* We implement a two dimensional array but allocate each row separately.
  60    The macro below determines how many entries should be used per row.
  61    It should better be a power of two.  */
  62 #define ENTRIES_PER_ROW 32
  63
  64 /* How many rows we allocate at once.  */
  65 #define ROWS_STEP       8
  66
  67 /* List of available entries.  */
  68 static struct requestlist *freelist;
  69
  70 /* List of request waiting to be processed.  */
  71 static struct requestlist *runlist;
  72
  73 /* Structure list of all currently processed requests.  */
  74 static struct requestlist *requests, *krequests;
  75
  76 /* Number of threads currently running.  */
  77 static int nthreads;
  78
  79 /* Number of threads waiting for work to arrive. */
  80 static int idle_thread_count;
  81
  82
  83 /* These are the values used to optimize the use of AIO.  The user can
  84    overwrite them by using the `aio_init' function.  */
  85 static struct aioinit optim =
  86 {
  87   20,   /* int aio_threads;     Maximal number of threads.  */
  88   64,   /* int aio_num;         Number of expected simultanious requests. */
  89   0,
  90   0,
  91   0,
  92   0,
  93   1,
  94   0
  95 };
  96
  97
  98 /* Since the list is global we need a mutex protecting it.  */
  99 pthread_mutex_t __aio_requests_mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
 100
 101 /* When you add a request to the list and there are idle threads present,
 102    you signal this condition variable. When a thread finishes work, it waits
 103    on this condition variable for a time before it actually exits. */
 104 pthread_cond_t __aio_new_request_notification = PTHREAD_COND_INITIALIZER;
 105
 106
 107 /* Functions to handle request list pool.  */
 108 static struct requestlist *
 109 get_elem (void)
 110 {
 111   struct requestlist *result;
 112
 113   if (freelist == NULL)
 114     {
 115       struct requestlist *new_row;
 116       int cnt;
 117
 118       assert (sizeof (struct aiocb) == sizeof (struct aiocb64));
 119
 120       if (pool_size + 1 >= pool_max_size)
 121         {
 122           size_t new_max_size = pool_max_size + ROWS_STEP;
 123           struct requestlist **new_tab;
 124
 125           new_tab = (struct requestlist **)
 126             realloc (pool, new_max_size * sizeof (struct requestlist *));
 127
 128           if (new_tab == NULL)
 129             return NULL;
 130
 131           pool_max_size = new_max_size;
 132           pool = new_tab;
 133         }
 134
 135       /* Allocate the new row.  */
 136       cnt = pool_size == 0 ? optim.aio_num : ENTRIES_PER_ROW;
 137       new_row = (struct requestlist *) calloc (cnt,
 138                                                sizeof (struct requestlist));
 139       if (new_row == NULL)
 140         return NULL;
 141
 142       pool[pool_size++] = new_row;
 143
 144       /* Put all the new entries in the freelist.  */
 145       do
 146         {
 147           new_row->next_prio = freelist;
 148           freelist = new_row++;
 149         }
 150       while (--cnt > 0);
 151     }
 152
 153   result = freelist;
 154   freelist = freelist->next_prio;
 155
 156   return result;
 157 }
 158
 159
 160 void
 161 internal_function
 162 __aio_free_request (struct requestlist *elem)
 163 {
 164   elem->running = no;
 165   elem->next_prio = freelist;
 166   freelist = elem;
 167 }
 168
 169
 170 struct requestlist *
 171 internal_function
 172 __aio_find_req (aiocb_union *elem)
 173 {
 174   struct requestlist *runp;
 175   int fildes = elem->aiocb.aio_fildes;
 176   int i;
 177
 178   for (i = 0; i < 2; i++)
 179     {
 180       runp = i ? requests : krequests;
 181
 182       while (runp != NULL && runp->aiocbp->aiocb.aio_fildes < fildes)
 183         runp = runp->next_fd;
 184
 185       if (runp != NULL)
 186         {
 187           if (runp->aiocbp->aiocb.aio_fildes != fildes)
 188             runp = NULL;
 189           else
 190             while (runp != NULL && runp->aiocbp != elem)
 191               runp = runp->next_prio;
 192           if (runp != NULL)
 193             return runp;
 194         }
 195     }
 196
 197   return NULL;
 198 }
 199
 200
 201 struct requestlist *
 202 internal_function
 203 __aio_find_req_fd (int fildes)
 204 {
 205   struct requestlist *runp = requests;
 206
 207   while (runp != NULL && runp->aiocbp->aiocb.aio_fildes < fildes)
 208     runp = runp->next_fd;
 209
 210   return (runp != NULL && runp->aiocbp->aiocb.aio_fildes == fildes
 211           ? runp : NULL);
 212 }
 213
 214
 215 struct requestlist *
 216 internal_function
 217 __aio_find_kreq_fd (int fildes)
 218 {
 219   struct requestlist *runp = krequests;
 220
 221   while (runp != NULL && runp->aiocbp->aiocb.aio_fildes < fildes)
 222     runp = runp->next_fd;
 223
 224   return (runp != NULL && runp->aiocbp->aiocb.aio_fildes == fildes
 225           ? runp : NULL);
 226 }
 227
 228
 229 void
 230 internal_function
 231 __aio_remove_request (struct requestlist *last, struct requestlist *req,
 232                       int all)
 233 {
 234   assert (req->running == yes || req->running == queued
 235           || req->running == done);
 236   assert (req->kioctx == KCTX_NONE);
 237
 238   if (last != NULL)
 239     last->next_prio = all ? NULL : req->next_prio;
 240   else
 241     {
 242       if (all || req->next_prio == NULL)
 243         {
 244           if (req->last_fd != NULL)
 245             req->last_fd->next_fd = req->next_fd;
 246           else
 247             requests = req->next_fd;
 248           if (req->next_fd != NULL)
 249             req->next_fd->last_fd = req->last_fd;
 250         }
 251       else
 252         {
 253           if (req->last_fd != NULL)
 254             req->last_fd->next_fd = req->next_prio;
 255           else
 256             requests = req->next_prio;
 257
 258           if (req->next_fd != NULL)
 259             req->next_fd->last_fd = req->next_prio;
 260
 261           req->next_prio->last_fd = req->last_fd;
 262           req->next_prio->next_fd = req->next_fd;
 263
 264           /* Mark this entry as runnable.  */
 265           req->next_prio->running = yes;
 266         }
 267
 268       if (req->running == yes)
 269         {
 270           struct requestlist *runp = runlist;
 271
 272           last = NULL;
 273           while (runp != NULL)
 274             {
 275               if (runp == req)
 276                 {
 277                   if (last == NULL)
 278                     runlist = runp->next_run;
 279                   else
 280                     last->next_run = runp->next_run;
 281                   break;
 282                 }
 283               last = runp;
 284               runp = runp->next_run;
 285             }
 286         }
 287     }
 288 }
 289
 290 void
 291 internal_function
 292 __aio_remove_krequest (struct requestlist *req)
 293 {
 294   assert (req->running == yes || req->running == queued
 295           || req->running == done);
 296   assert (req->kioctx != KCTX_NONE);
 297
 298   if (req->prev_prio != NULL)
 299     {
 300       req->prev_prio->next_prio = req->next_prio;
 301       if (req->next_prio != NULL)
 302         req->next_prio->prev_prio = req->prev_prio;
 303     }
 304   else if (req->next_prio == NULL)
 305     {
 306       if (req->last_fd != NULL)
 307         req->last_fd->next_fd = req->next_fd;
 308       else
 309         krequests = req->next_fd;
 310       if (req->next_fd != NULL)
 311         req->next_fd->last_fd = req->last_fd;
 312     }
 313   else
 314     {
 315       if (req->last_fd != NULL)
 316         req->last_fd->next_fd = req->next_prio;
 317       else
 318         krequests = req->next_prio;
 319       if (req->next_fd != NULL)
 320         req->next_fd->last_fd = req->next_prio;
 321
 322       req->next_prio->prev_prio = NULL;
 323       req->next_prio->last_fd = req->last_fd;
 324       req->next_prio->next_fd = req->next_fd;
 325     }
 326 }
 327
 328
 /* Start routine of the worker threads that process requests.  */
 static void *handle_fildes_io (void *arg);

 /* Blocks until the kernel requests pending for FILDES have finished.  */
 static int wait_for_kernel_requests (int fildes);
 332
 333
 334 /* User optimization.  */
 335 void
 336 __aio_init (const struct aioinit *init)
 337 {
 338   /* Get the mutex.  */
 339   pthread_mutex_lock (&__aio_requests_mutex);
 340
 341   /* Only allow writing new values if the table is not yet allocated.  */
 342   if (pool == NULL)
 343     {
 344       optim.aio_threads = init->aio_threads < 1 ? 1 : init->aio_threads;
 345       optim.aio_num = (init->aio_num < ENTRIES_PER_ROW
 346                        ? ENTRIES_PER_ROW
 347                        : init->aio_num & ~ENTRIES_PER_ROW);
 348     }
 349
 350   if (init->aio_idle_time != 0)
 351     optim.aio_idle_time = init->aio_idle_time;
 352
 353   /* Release the mutex.  */
 354   pthread_mutex_unlock (&__aio_requests_mutex);
 355 }
 356 weak_alias (__aio_init, aio_init)
 357
 358 static void
 359 kernel_callback (kctx_t ctx, struct kiocb *kiocb, long res, long res2)
 360 {
 361   struct requestlist *req = (struct requestlist *)kiocb;
 362
 363   req->aiocbp->aiocb.__error_code = 0;
 364   req->aiocbp->aiocb.__return_value = res;
 365   if (res < 0 && res > -1000)
 366     {
 367       req->aiocbp->aiocb.__error_code = -res;
 368       req->aiocbp->aiocb.__return_value = -1;
 369     }
 370   __aio_notify (req);
 371   assert (req->running == allocated);
 372   req->running = done;
 373   __aio_remove_krequest (req);
 374   __aio_free_request (req);
 375 }
 376
 377 void
 378 internal_function
 379 __aio_read_one_event (void)
 380 {
 381   struct kio_event ev[10];
 382   struct timespec ts;
 383   int count, i;
 384
 385   if (__aio_kioctx == KCTX_NONE)
 386     return;
 387   ts.tv_sec = 0;
 388   ts.tv_nsec = 0;
 389   do
 390     {
 391       INTERNAL_SYSCALL_DECL (err);
 392       count = INTERNAL_SYSCALL (io_getevents, err, 5, __aio_kioctx, 0, 10,
 393                                 ev, &ts);
 394       if (INTERNAL_SYSCALL_ERROR_P (count, err) || count == 0)
 395         break;
 396       pthread_mutex_lock (&__aio_requests_mutex);
 397       for (i = 0; i < count; i++)
 398         {
 399           void (*cb)(kctx_t, struct kiocb *, long, long);
 400
 401           cb = (void *) (uintptr_t) ev[i].kioe_data;
 402           cb (__aio_kioctx, (struct kiocb *) (uintptr_t) ev[i].kioe_obj,
 403               ev[i].kioe_res, ev[i].kioe_res2);
 404         }
 405       pthread_mutex_unlock (&__aio_requests_mutex);
 406     }
 407   while (count == 10);
 408 }
 409
 410 int
 411 internal_function
 412 __aio_wait_for_events (kctx_t kctx, const struct timespec *timespec)
 413 {
 414   int ret, i;
 415   struct kio_event ev[10];
 416   struct timespec ts;
 417   INTERNAL_SYSCALL_DECL (err);
 418
 419   pthread_mutex_unlock (&__aio_requests_mutex);
 420   ts.tv_sec = 0;
 421   ts.tv_nsec = 0;
 422   do
 423     {
 424       ret = INTERNAL_SYSCALL (io_getevents, err, 5, kctx, 0, 10, ev,
 425                               timespec);
 426       if (INTERNAL_SYSCALL_ERROR_P (ret, err) || ret == 0)
 427         break;
 428
 429       pthread_mutex_lock (&__aio_requests_mutex);
 430       for (i = 0; i < ret; i++)
 431         {
 432           void (*cb)(kctx_t, struct kiocb *, long, long);
 433
 434           cb = (void *) (uintptr_t) ev[i].kioe_data;
 435           cb (kctx, (struct kiocb *) (uintptr_t) ev[i].kioe_obj,
 436               ev[i].kioe_res, ev[i].kioe_res2);
 437         }
 438       if (ret < 10)
 439         return 0;
 440       pthread_mutex_unlock (&__aio_requests_mutex);
 441       timespec = &ts;
 442     }
 443   while (1);
 444
 445   pthread_mutex_lock (&__aio_requests_mutex);
 446   return (timespec != &ts
 447           && INTERNAL_SYSCALL_ERROR_P (ret, err)
 448           && INTERNAL_SYSCALL_ERRNO (ret, err) == ETIMEDOUT) ? ETIMEDOUT : 0;
 449 }
 450
 451 int
 452 internal_function
 453 __aio_create_kernel_thread (void)
 454 {
 455   pthread_t thid;
 456   pthread_attr_t attr;
 457
 458   if (__kernel_thread_started)
 459     return 0;
 460
 461   /* Make sure the thread is created detached.  */
 462   pthread_attr_init (&attr);
 463   pthread_attr_setdetachstate (&attr, PTHREAD_CREATE_DETACHED);
 464
 465   if (pthread_create (&thid, &attr, handle_kernel_aio, NULL) != 0)
 466     return -1;
 467   __kernel_thread_started = 1;
 468   return 0;
 469 }
 470
 471 static void *
 472 handle_kernel_aio (void *arg __attribute__((unused)))
 473 {
 474   int ret, i;
 475   INTERNAL_SYSCALL_DECL (err);
 476   struct kio_event ev[10];
 477
 478   for (;;)
 479     {
 480       ret = INTERNAL_SYSCALL (io_getevents, err, 5, __aio_kioctx, 0, 10, ev,
 481                               NULL);
 482       if (INTERNAL_SYSCALL_ERROR_P (ret, err) || ret == 0)
 483         continue;
 484       pthread_mutex_lock (&__aio_requests_mutex);
 485       for (i = 0; i < ret; i++)
 486         {
 487           void (*cb)(kctx_t, struct kiocb *, long, long);
 488
 489           cb = (void *) (uintptr_t) ev[i].kioe_data;
 490           cb (__aio_kioctx, (struct kiocb *) (uintptr_t) ev[i].kioe_obj,
 491               ev[i].kioe_res, ev[i].kioe_res2);
 492         }
 493       pthread_mutex_unlock (&__aio_requests_mutex);
 494     }
 495   return NULL;
 496 }
 497
 498 static int
 499 internal_function
 500 add_request_to_list (struct requestlist *newp, int fildes, int prio)
 501 {
 502   struct requestlist *last, *runp, *reqs;
 503
 504   last = NULL;
 505   reqs = newp->kioctx != KCTX_NONE ? krequests : requests;
 506   runp = reqs;
 507
 508   /* First look whether the current file descriptor is currently
 509      worked with.  */
 510   while (runp != NULL
 511          && runp->aiocbp->aiocb.aio_fildes < fildes)
 512     {
 513       last = runp;
 514       runp = runp->next_fd;
 515     }
 516
 517   if (runp != NULL
 518       && runp->aiocbp->aiocb.aio_fildes == fildes)
 519     {
 520       /* The current file descriptor is worked on.  It makes no sense
 521          to start another thread since this new thread would fight
 522          with the running thread for the resources.  But we also cannot
 523          say that the thread processing this desriptor shall immediately
 524          after finishing the current job process this request if there
 525          are other threads in the running queue which have a higher
 526          priority.  */
 527
 528       /* Simply enqueue it after the running one according to the
 529          priority.  */
 530       while (runp->next_prio != NULL
 531              && runp->next_prio->aiocbp->aiocb.__abs_prio >= prio)
 532         runp = runp->next_prio;
 533
 534       newp->next_prio = runp->next_prio;
 535       runp->next_prio = newp;
 536       if (newp->kioctx != KCTX_NONE)
 537         {
 538           newp->prev_prio = runp;
 539           if (newp->next_prio != NULL)
 540             newp->next_prio->prev_prio = newp;
 541         }
 542       return queued;
 543     }
 544   else
 545     {
 546       /* Enqueue this request for a new descriptor.  */
 547       if (last == NULL)
 548         {
 549           newp->last_fd = NULL;
 550           newp->next_fd = reqs;
 551           if (reqs != NULL)
 552             reqs->last_fd = newp;
 553           if (newp->kioctx != KCTX_NONE)
 554             krequests = newp;
 555           else
 556             requests = newp;
 557         }
 558       else
 559         {
 560           newp->next_fd = last->next_fd;
 561           newp->last_fd = last;
 562           last->next_fd = newp;
 563           if (newp->next_fd != NULL)
 564             newp->next_fd->last_fd = newp;
 565         }
 566
 567       newp->next_prio = NULL;
 568       if (newp->kioctx != KCTX_NONE)
 569         newp->prev_prio = NULL;
 570       return yes;
 571     }
 572 }
 573
 574 static int
 575 internal_function
 576 __aio_enqueue_user_request (struct requestlist *newp)
 577 {
 578   int result = 0;
 579   int running = add_request_to_list (newp, newp->aiocbp->aiocb.aio_fildes,
 580                                      newp->aiocbp->aiocb.__abs_prio);
 581
 582   if (running == yes)
 583     {
 584       /* We try to create a new thread for this file descriptor.  The
 585          function which gets called will handle all available requests
 586          for this descriptor and when all are processed it will
 587          terminate.
 588
 589          If no new thread can be created or if the specified limit of
 590          threads for AIO is reached we queue the request.  */
 591
 592       /* See if we need to and are able to create a thread.  */
 593       if (nthreads < optim.aio_threads && idle_thread_count == 0)
 594         {
 595           pthread_t thid;
 596           pthread_attr_t attr;
 597
 598           /* Make sure the thread is created detached.  */
 599           pthread_attr_init (&attr);
 600           pthread_attr_setdetachstate (&attr, PTHREAD_CREATE_DETACHED);
 601
 602           running = newp->running = allocated;
 603
 604           /* Now try to start a thread.  */
 605           if (pthread_create (&thid, &attr, handle_fildes_io, newp) == 0)
 606             /* We managed to enqueue the request.  All errors which can
 607                happen now can be recognized by calls to `aio_return' and
 608                `aio_error'.  */
 609             ++nthreads;
 610           else
 611             {
 612               /* Reset the running flag.  The new request is not running.  */
 613               running = newp->running = yes;
 614
 615               if (nthreads == 0)
 616                 /* We cannot create a thread in the moment and there is
 617                    also no thread running.  This is a problem.  `errno' is
 618                    set to EAGAIN if this is only a temporary problem.  */
 619                 result = -1;
 620             }
 621         }
 622     }
 623
 624   /* Enqueue the request in the run queue if it is not yet running.  */
 625   if (running == yes && result == 0)
 626     {
 627       add_request_to_runlist (newp);
 628
 629       /* If there is a thread waiting for work, then let it know that we
 630          have just given it something to do. */
 631       if (idle_thread_count > 0)
 632         pthread_cond_signal (&__aio_new_request_notification);
 633     }
 634
 635   if (result == 0)
 636     newp->running = running;
 637   return result;
 638 }
 639
 640 /* The main function of the async I/O handling.  It enqueues requests
 641    and if necessary starts and handles threads.  */
 642 struct requestlist *
 643 internal_function
 644 __aio_enqueue_request_ctx (aiocb_union *aiocbp, int operation, kctx_t kctx)
 645 {
 646   int policy, prio;
 647   struct sched_param param;
 648   struct requestlist *newp;
 649   int op = (operation & 0xffff);
 650
 651   if (op == LIO_SYNC || op == LIO_DSYNC)
 652     {
 653       aiocbp->aiocb.aio_reqprio = 0;
 654       /* FIXME: Kernel doesn't support sync yet.  */
 655       operation &= ~LIO_KTHREAD;
 656     }
 657   else if (aiocbp->aiocb.aio_reqprio < 0
 658            || aiocbp->aiocb.aio_reqprio > AIO_PRIO_DELTA_MAX)
 659     {
 660       /* Invalid priority value.  */
 661       __set_errno (EINVAL);
 662       aiocbp->aiocb.__error_code = EINVAL;
 663       aiocbp->aiocb.__return_value = -1;
 664       return NULL;
 665     }
 666
 667   /* Compute priority for this request.  */
 668   pthread_getschedparam (pthread_self (), &policy, &param);
 669   prio = param.sched_priority - aiocbp->aiocb.aio_reqprio;
 670
 671   /* Get the mutex.  */
 672   pthread_mutex_lock (&__aio_requests_mutex);
 673
 674   if (operation & LIO_KTHREAD)
 675     {
 676       if (__aio_kioctx == KCTX_NONE && !__have_no_kernel_aio)
 677         {
 678           int res;
 679           INTERNAL_SYSCALL_DECL (err);
 680
 681           __aio_kioctx = 0;
 682           do
 683             res = INTERNAL_SYSCALL (io_setup, err, 2, 1024, &__aio_kioctx);
 684           while (INTERNAL_SYSCALL_ERROR_P (res, err)
 685                  && INTERNAL_SYSCALL_ERRNO (res, err) == EINTR);
 686           if (INTERNAL_SYSCALL_ERROR_P (res, err))
 687             {
 688               __have_no_kernel_aio = 1;
 689               __aio_kioctx = KCTX_NONE;
 690             }
 691         }
 692
 693       kctx = __aio_kioctx;
 694
 695       if (kctx != KCTX_NONE && !__kernel_thread_started
 696           && ((operation & LIO_KTHREAD_REQUIRED)
 697               || aiocbp->aiocb.aio_sigevent.sigev_notify != SIGEV_NONE))
 698         {
 699           if (__aio_create_kernel_thread () < 0)
 700             kctx = KCTX_NONE;
 701         }
 702     }
 703
 704   /* Get a new element for the waiting list.  */
 705   newp = get_elem ();
 706   if (newp == NULL)
 707     {
 708       pthread_mutex_unlock (&__aio_requests_mutex);
 709       __set_errno (EAGAIN);
 710       return NULL;
 711     }
 712   newp->aiocbp = aiocbp;
 713 #ifdef BROKEN_THREAD_SIGNALS
 714   newp->caller_pid = (aiocbp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL
 715                       ? getpid () : 0);
 716 #endif
 717   newp->waiting = NULL;
 718   newp->kioctx = kctx;
 719
 720   aiocbp->aiocb.__abs_prio = prio;
 721   aiocbp->aiocb.__policy = policy;
 722   aiocbp->aiocb.aio_lio_opcode = op;
 723   aiocbp->aiocb.__error_code = EINPROGRESS;
 724   aiocbp->aiocb.__return_value = 0;
 725
 726   if (newp->kioctx != KCTX_NONE)
 727     {
 728       int res;
 729       INTERNAL_SYSCALL_DECL (err);
 730
 731       aiocb_union *aiocbp = newp->aiocbp;
 732       struct kiocb *kiocbs[] __attribute__((unused)) = { &newp->kiocb };
 733
 734       newp->kiocb.kiocb_data = (uintptr_t) kernel_callback;
 735       switch (op & 127)
 736         {
 737         case LIO_READ: newp->kiocb.kiocb_lio_opcode = IO_CMD_PREAD; break;
 738         case LIO_WRITE: newp->kiocb.kiocb_lio_opcode = IO_CMD_PWRITE; break;
 739         case LIO_SYNC:
 740         case LIO_DSYNC: newp->kiocb.kiocb_lio_opcode = IO_CMD_FSYNC; break;
 741         }
 742       if (op & 128)
 743         newp->kiocb.kiocb_offset = aiocbp->aiocb64.aio_offset;
 744       else
 745         newp->kiocb.kiocb_offset = aiocbp->aiocb.aio_offset;
 746       newp->kiocb.kiocb_fildes = aiocbp->aiocb.aio_fildes;
 747       newp->kiocb.kiocb_buf = (uintptr_t) aiocbp->aiocb.aio_buf;
 748       newp->kiocb.kiocb_nbytes = aiocbp->aiocb.aio_nbytes;
 749       /* FIXME.  */
 750       newp->kiocb.kiocb_req_prio = 0;
 751       res = INTERNAL_SYSCALL (io_submit, err, 3, newp->kioctx, 1, kiocbs);
 752       if (! INTERNAL_SYSCALL_ERROR_P (res, err))
 753         {
 754           newp->running = allocated;
 755           add_request_to_list (newp, aiocbp->aiocb.aio_fildes, prio);
 756           /* Release the mutex.  */
 757           pthread_mutex_unlock (&__aio_requests_mutex);
 758           return newp;
 759         }
 760       newp->kioctx = KCTX_NONE;
 761     }
 762
 763   if (__aio_enqueue_user_request (newp))
 764     {
 765       /* Something went wrong.  */
 766       __aio_free_request (newp);
 767       newp = NULL;
 768     }
 769
 770   /* Release the mutex.  */
 771   pthread_mutex_unlock (&__aio_requests_mutex);
 772
 773   return newp;
 774 }
 775
 776
 777 static int
 778 wait_for_kernel_requests (int fildes)
 779 {
 780   pthread_mutex_lock (&__aio_requests_mutex);
 781
 782   struct requestlist *kreq = __aio_find_kreq_fd (fildes), *req;
 783   int nent = 0;
 784   int ret = 0;
 785
 786   req = kreq;
 787   while (req)
 788     {
 789       if (req->running == allocated)
 790         ++nent;
 791       req = req->next_prio;
 792     }
 793
 794   if (nent)
 795     {
 796       if (__aio_create_kernel_thread () < 0)
 797         {
 798           pthread_mutex_unlock (&__aio_requests_mutex);
 799           return -1;
 800         }
 801
 802       pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
 803       struct waitlist waitlist[nent];
 804       int cnt = 0;
 805
 806       while (kreq)
 807         {
 808           if (kreq->running == allocated)
 809             {
 810               waitlist[cnt].cond = &cond;
 811               waitlist[cnt].next = kreq->waiting;
 812               waitlist[cnt].counterp = &nent;
 813               waitlist[cnt].sigevp = NULL;
 814 #ifdef BROKEN_THREAD_SIGNALS
 815               waitlist[cnt].caller_pid = 0;   /* Not needed.  */
 816 #endif
 817               kreq->waiting = &waitlist[cnt++];
 818             }
 819           kreq = kreq->next_prio;
 820         }
 821
 822       do
 823         pthread_cond_wait (&cond, &__aio_requests_mutex);
 824       while (nent);
 825
 826       pthread_cond_destroy (&cond);
 827     }
 828
 829   pthread_mutex_unlock (&__aio_requests_mutex);
 830   return ret;
 831 }
 832
 833
 834 static void *
 835 __attribute__ ((noreturn))
 836 handle_fildes_io (void *arg)
 837 {
 838   pthread_t self = pthread_self ();
 839   struct sched_param param;
 840   struct requestlist *runp = (struct requestlist *) arg;
 841   aiocb_union *aiocbp;
 842   int policy;
 843   int fildes;
 844
 845   pthread_getschedparam (self, &policy, &param);
 846
 847   do
 848     {
 849       /* If runp is NULL, then we were created to service the work queue
 850          in general, not to handle any particular request. In that case we
 851          skip the "do work" stuff on the first pass, and go directly to the
 852          "get work off the work queue" part of this loop, which is near the
 853          end. */
 854       if (runp == NULL)
 855         pthread_mutex_lock (&__aio_requests_mutex);
 856       else
 857         {
 858           /* Hopefully this request is marked as running.  */
 859           assert (runp->running == allocated);
 860
 861           /* Update our variables.  */
 862           aiocbp = runp->aiocbp;
 863           fildes = aiocbp->aiocb.aio_fildes;
 864
 865           /* Change the priority to the requested value (if necessary).  */
 866           if (aiocbp->aiocb.__abs_prio != param.sched_priority
 867               || aiocbp->aiocb.__policy != policy)
 868             {
 869               param.sched_priority = aiocbp->aiocb.__abs_prio;
 870               policy = aiocbp->aiocb.__policy;
 871               pthread_setschedparam (self, policy, &param);
 872             }
 873
 874           /* Process request pointed to by RUNP.  We must not be disturbed
 875              by signals.  */
 876           if ((aiocbp->aiocb.aio_lio_opcode & 127) == LIO_READ)
 877             {
 878               if (aiocbp->aiocb.aio_lio_opcode & 128)
 879                 aiocbp->aiocb.__return_value =
 880                   TEMP_FAILURE_RETRY (__pread64 (fildes, (void *)
 881                                                  aiocbp->aiocb64.aio_buf,
 882                                                  aiocbp->aiocb64.aio_nbytes,
 883                                                  aiocbp->aiocb64.aio_offset));
 884               else
 885                 aiocbp->aiocb.__return_value =
 886                   TEMP_FAILURE_RETRY (pread (fildes,
 887                                              (void *) aiocbp->aiocb.aio_buf,
 888                                              aiocbp->aiocb.aio_nbytes,
 889                                              aiocbp->aiocb.aio_offset));
 890
 891               if (aiocbp->aiocb.__return_value == -1 && errno == ESPIPE)
 892                 /* The Linux kernel is different from others.  It returns
 893                    ESPIPE if using pread on a socket.  Other platforms
 894                    simply ignore the offset parameter and behave like
 895                    read.  */
 896                 aiocbp->aiocb.__return_value =
 897                   TEMP_FAILURE_RETRY (read (fildes,
 898                                             (void *) aiocbp->aiocb64.aio_buf,
 899                                             aiocbp->aiocb64.aio_nbytes));
 900             }
 901           else if ((aiocbp->aiocb.aio_lio_opcode & 127) == LIO_WRITE)
 902             {
 903               if (aiocbp->aiocb.aio_lio_opcode & 128)
 904                 aiocbp->aiocb.__return_value =
 905                   TEMP_FAILURE_RETRY (__pwrite64 (fildes, (const void *)
 906                                                   aiocbp->aiocb64.aio_buf,
 907                                                   aiocbp->aiocb64.aio_nbytes,
 908                                                   aiocbp->aiocb64.aio_offset));
 909               else
 910                 aiocbp->aiocb.__return_value =
 911                   TEMP_FAILURE_RETRY (__libc_pwrite (fildes, (const void *)
 912                                                      aiocbp->aiocb.aio_buf,
 913                                                      aiocbp->aiocb.aio_nbytes,
 914                                                      aiocbp->aiocb.aio_offset));
 915
 916               if (aiocbp->aiocb.__return_value == -1 && errno == ESPIPE)
 917                 /* The Linux kernel is different from others.  It returns
 918                    ESPIPE if using pwrite on a socket.  Other platforms
 919                    simply ignore the offset parameter and behave like
 920                    write.  */
 921                 aiocbp->aiocb.__return_value =
 922                   TEMP_FAILURE_RETRY (write (fildes,
 923                                              (void *) aiocbp->aiocb64.aio_buf,
 924                                              aiocbp->aiocb64.aio_nbytes));
 925             }
 926           else if (aiocbp->aiocb.aio_lio_opcode == LIO_DSYNC
 927                    || aiocbp->aiocb.aio_lio_opcode == LIO_SYNC)
 928             {
 929               if (wait_for_kernel_requests (fildes) < 0)
 930                 {
 931                   aiocbp->aiocb.__return_value = -1;
 932                   __set_errno (ENOMEM);
 933                 }
 934               else if (aiocbp->aiocb.aio_lio_opcode == LIO_DSYNC)
 935                 aiocbp->aiocb.__return_value =
 936                   TEMP_FAILURE_RETRY (fdatasync (fildes));
 937               else
 938                 aiocbp->aiocb.__return_value =
 939                   TEMP_FAILURE_RETRY (fsync (fildes));
 940             }
 941           else
 942             {
 943               /* This is an invalid opcode.  */
 944               aiocbp->aiocb.__return_value = -1;
 945               __set_errno (EINVAL);
 946             }
 947
 948           /* Get the mutex.  */
 949           pthread_mutex_lock (&__aio_requests_mutex);
 950
 951           /* In theory we would need here a write memory barrier since the
 952              callers test using aio_error() whether the request finished
 953              and once this value != EINPROGRESS the field __return_value
 954              must be committed to memory.
 955
 956              But since the pthread_mutex_lock call involves write memory
 957              barriers as well it is not necessary.  */
 958
 959           if (aiocbp->aiocb.__return_value == -1)
 960             aiocbp->aiocb.__error_code = errno;
 961           else
 962             aiocbp->aiocb.__error_code = 0;
 963
 964           /* Send the signal to notify about finished processing of the
 965              request.  */
 966           __aio_notify (runp);
 967
 968           /* For debugging purposes we reset the running flag of the
 969              finished request.  */
 970           assert (runp->running == allocated);
 971           runp->running = done;
 972
 973           /* Now dequeue the current request.  */
 974           __aio_remove_request (NULL, runp, 0);
 975           if (runp->next_prio != NULL)
 976             add_request_to_runlist (runp->next_prio);
 977
 978           /* Free the old element.  */
 979           __aio_free_request (runp);
 980         }
 981
 982       runp = runlist;
 983
 984       /* If the runlist is empty, then we sleep for a while, waiting for
 985          something to arrive in it. */
 986       if (runp == NULL && optim.aio_idle_time >= 0)
 987         {
 988           struct timeval now;
 989           struct timespec wakeup_time;
 990
 991           ++idle_thread_count;
 992           gettimeofday (&now, NULL);
 993           wakeup_time.tv_sec = now.tv_sec + optim.aio_idle_time;
 994           wakeup_time.tv_nsec = now.tv_usec * 1000;
 995           if (wakeup_time.tv_nsec > 1000000000)
 996             {
 997               wakeup_time.tv_nsec -= 1000000000;
 998               ++wakeup_time.tv_sec;
 999             }
1000           pthread_cond_timedwait (&__aio_new_request_notification,
1001                                   &__aio_requests_mutex,
1002                                   &wakeup_time);
1003           --idle_thread_count;
1004           runp = runlist;
1005         }
1006
1007       if (runp == NULL)
1008         --nthreads;
1009       else
1010         {
1011           assert (runp->running == yes);
1012           runp->running = allocated;
1013           runlist = runp->next_run;
1014
1015           /* If we have a request to process, and there's still another in
1016              the run list, then we need to either wake up or create a new
1017              thread to service the request that is still in the run list. */
1018           if (runlist != NULL)
1019             {
1020               /* There are at least two items in the work queue to work on.
1021                  If there are other idle threads, then we should wake them
1022                  up for these other work elements; otherwise, we should try
1023                  to create a new thread. */
1024               if (idle_thread_count > 0)
1025                 pthread_cond_signal (&__aio_new_request_notification);
1026               else if (nthreads < optim.aio_threads)
1027                 {
1028                   pthread_t thid;
1029                   pthread_attr_t attr;
1030
1031                   /* Make sure the thread is created detached.  */
1032                   pthread_attr_init (&attr);
1033                   pthread_attr_setdetachstate (&attr, PTHREAD_CREATE_DETACHED);
1034
1035                   /* Now try to start a thread. If we fail, no big deal,
1036                      because we know that there is at least one thread (us)
1037                      that is working on AIO operations. */
1038                   if (pthread_create (&thid, &attr, handle_fildes_io, NULL)
1039                       == 0)
1040                     ++nthreads;
1041                 }
1042             }
1043         }
1044
1045       /* Release the mutex.  */
1046       pthread_mutex_unlock (&__aio_requests_mutex);
1047     }
1048   while (runp != NULL);
1049
1050   pthread_exit (NULL);
1051 }
1052
1053
1054 /* Free allocated resources.  */
1055 libc_freeres_fn (free_res)
1056 {
1057   size_t row;
1058
1059   for (row = 0; row < pool_max_size; ++row)
1060     free (pool[row]);
1061
1062   free (pool);
1063 }
1064
1065
1066 /* Add newrequest to the runlist. The __abs_prio flag of newrequest must
1067    be correctly set to do this. Also, you had better set newrequest's
1068    "running" flag to "yes" before you release your lock or you'll throw an
1069    assertion. */
1070 static void
1071 internal_function
1072 add_request_to_runlist (struct requestlist *newrequest)
1073 {
1074   int prio = newrequest->aiocbp->aiocb.__abs_prio;
1075   struct requestlist *runp;
1076
1077   if (runlist == NULL || runlist->aiocbp->aiocb.__abs_prio < prio)
1078     {
1079       newrequest->next_run = runlist;
1080       runlist = newrequest;
1081     }
1082   else
1083     {
1084       runp = runlist;
1085
1086       while (runp->next_run != NULL
1087              && runp->next_run->aiocbp->aiocb.__abs_prio >= prio)
1088         runp = runp->next_run;
1089
1090       newrequest->next_run = runp->next_run;
1091       runp->next_run = newrequest;
1092     }
1093 }
1094 #endif