3 * Copyright (C) 1992 Krishna Balasubramanian
4 * Copyright (C) 1995 Eric Schenk, Bruno Haible
6 * IMPLEMENTATION NOTES ON CODE REWRITE (Eric Schenk, January 1995):
7 * This code underwent a massive rewrite in order to solve some problems
8 * with the original code. In particular the original code failed to
9 * wake up processes that were waiting for semval to go to 0 if the
10 * value went to 0 and was then incremented rapidly enough. In solving
11 * this problem I have also modified the implementation so that it
12 * processes pending operations in a FIFO manner, thus giving a guarantee
13 * that processes waiting for a lock on the semaphore won't starve
14 * unless another locking process fails to unlock.
15 * In addition the following two changes in behavior have been introduced:
16 * - The original implementation of semop returned the value
17 * last semaphore element examined on success. This does not
18 * match the manual page specifications, and effectively
19 * allows the user to read the semaphore even if they do not
20 * have read permissions. The implementation now returns 0
21 * on success as stated in the manual page.
22 * - There is some confusion over whether the set of undo adjustments
23 * to be performed at exit should be done in an atomic manner.
24 * That is, if we are attempting to decrement the semval should we queue
25 * up and wait until we can do so legally?
26 * The original implementation attempted to do this.
27 * The current implementation does not do so. This is because I don't
28 * think it is the right thing (TM) to do, and because I couldn't
29 * see a clean way to get the old behavior with the new design.
30 * The POSIX standard and SVID should be consulted to determine
31 * what behavior is mandated.
33 * Further notes on refinement (Christoph Rohland, December 1998):
34 * - The POSIX standard says that the undo adjustments simply should
35 * redo. So the current implementation is o.K.
36 * - The previous code had two flaws:
37 * 1) It actively gave the semaphore to the next waiting process
38 * sleeping on the semaphore. Since this process did not have the
39 * cpu this led to many unnecessary context switches and bad
40 * performance. Now we only check which process should be able to
41 * get the semaphore and if this process wants to reduce some
42 * semaphore value we simply wake it up without doing the
43 * operation. So it has to try to get it later. Thus e.g. the
44 * running process may reacquire the semaphore during the current
45 * time slice. If it only waits for zero or increases the semaphore,
46 * we do the operation in advance and wake it up.
47 * 2) It did not wake up all zero waiting processes. We try to do
48 * better but only get the semops right which only wait for zero or
49 * increase. If there are decrement operations in the operations
50 * array we do the same as before.
52 * /proc/sysvipc/sem support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
55 #include <linux/config.h>
56 #include <linux/malloc.h>
57 #include <linux/smp_lock.h>
58 #include <linux/init.h>
59 #include <linux/proc_fs.h>
61 #include <asm/uaccess.h>
63 extern int ipcperms (struct ipc_perm
*ipcp
, short semflg
);
64 static int newary (key_t
, int, int);
65 static int findkey (key_t key
);
66 static void freeary (int id
);
68 static int sysvipc_sem_read_proc(char *buffer
, char **start
, off_t offset
, int length
, int *eof
, void *data
);
71 static struct semid_ds
*semary
[SEMMNI
];
72 static int used_sems
= 0, used_semids
= 0;
73 static DECLARE_WAIT_QUEUE_HEAD(sem_lock
);
74 static int max_semid
= 0;
76 static unsigned short sem_seq
= 0;
78 void __init
sem_init (void)
82 struct proc_dir_entry
*ent
;
85 init_waitqueue_head(&sem_lock
);
86 used_sems
= used_semids
= max_semid
= sem_seq
= 0;
87 for (i
= 0; i
< SEMMNI
; i
++)
88 semary
[i
] = (struct semid_ds
*) IPC_UNUSED
;
90 ent
= create_proc_entry("sysvipc/sem", 0, 0);
91 ent
->read_proc
= sysvipc_sem_read_proc
;
96 static int findkey (key_t key
)
101 for (id
= 0; id
<= max_semid
; id
++) {
102 while ((sma
= semary
[id
]) == IPC_NOID
)
103 interruptible_sleep_on (&sem_lock
);
104 if (sma
== IPC_UNUSED
)
106 if (key
== sma
->sem_perm
.key
)
112 static int newary (key_t key
, int nsems
, int semflg
)
115 struct semid_ds
*sma
;
116 struct ipc_perm
*ipcp
;
121 if (used_sems
+ nsems
> SEMMNS
)
123 for (id
= 0; id
< SEMMNI
; id
++)
124 if (semary
[id
] == IPC_UNUSED
) {
125 semary
[id
] = (struct semid_ds
*) IPC_NOID
;
130 size
= sizeof (*sma
) + nsems
* sizeof (struct sem
);
132 sma
= (struct semid_ds
*) kmalloc (size
, GFP_KERNEL
);
134 semary
[id
] = (struct semid_ds
*) IPC_UNUSED
;
139 memset (sma
, 0, size
);
140 sma
->sem_base
= (struct sem
*) &sma
[1];
141 ipcp
= &sma
->sem_perm
;
142 ipcp
->mode
= (semflg
& S_IRWXUGO
);
144 ipcp
->cuid
= ipcp
->uid
= current
->euid
;
145 ipcp
->gid
= ipcp
->cgid
= current
->egid
;
146 sma
->sem_perm
.seq
= sem_seq
;
147 /* sma->sem_pending = NULL; */
148 sma
->sem_pending_last
= &sma
->sem_pending
;
149 /* sma->undo = NULL; */
150 sma
->sem_nsems
= nsems
;
151 sma
->sem_ctime
= CURRENT_TIME
;
157 return (unsigned int) sma
->sem_perm
.seq
* SEMMNI
+ id
;
160 asmlinkage
int sys_semget (key_t key
, int nsems
, int semflg
)
162 int id
, err
= -EINVAL
;
163 struct semid_ds
*sma
;
166 if (nsems
< 0 || nsems
> SEMMSL
)
168 if (key
== IPC_PRIVATE
) {
169 err
= newary(key
, nsems
, semflg
);
170 } else if ((id
= findkey (key
)) == -1) { /* key not used */
171 if (!(semflg
& IPC_CREAT
))
174 err
= newary(key
, nsems
, semflg
);
175 } else if (semflg
& IPC_CREAT
&& semflg
& IPC_EXCL
) {
179 if (nsems
> sma
->sem_nsems
)
181 else if (ipcperms(&sma
->sem_perm
, semflg
))
184 err
= (int) sma
->sem_perm
.seq
* SEMMNI
+ id
;
191 /* Manage the doubly linked list sma->sem_pending as a FIFO:
192 * insert new queue elements at the tail sma->sem_pending_last.
194 static inline void append_to_queue (struct semid_ds
* sma
,
195 struct sem_queue
* q
)
197 *(q
->prev
= sma
->sem_pending_last
) = q
;
198 *(sma
->sem_pending_last
= &q
->next
) = NULL
;
201 static inline void prepend_to_queue (struct semid_ds
* sma
,
202 struct sem_queue
* q
)
204 q
->next
= sma
->sem_pending
;
205 *(q
->prev
= &sma
->sem_pending
) = q
;
207 q
->next
->prev
= &q
->next
;
208 else /* sma->sem_pending_last == &sma->sem_pending */
209 sma
->sem_pending_last
= &q
->next
;
212 static inline void remove_from_queue (struct semid_ds
* sma
,
213 struct sem_queue
* q
)
215 *(q
->prev
) = q
->next
;
217 q
->next
->prev
= q
->prev
;
218 else /* sma->sem_pending_last == &q->next */
219 sma
->sem_pending_last
= q
->prev
;
220 q
->prev
= NULL
; /* mark as removed */
224 * Determine whether a sequence of semaphore operations would succeed
225 * all at once. Return 0 if yes, 1 if need to sleep, else return error code.
228 static int try_atomic_semop (struct semid_ds
* sma
, struct sembuf
* sops
,
229 int nsops
, struct sem_undo
*un
, int pid
,
236 for (sop
= sops
; sop
< sops
+ nsops
; sop
++) {
237 curr
= sma
->sem_base
+ sop
->sem_num
;
238 sem_op
= sop
->sem_op
;
240 if (!sem_op
&& curr
->semval
)
243 curr
->sempid
= (curr
->sempid
<< 16) | pid
;
244 curr
->semval
+= sem_op
;
245 if (sop
->sem_flg
& SEM_UNDO
)
246 un
->semadj
[sop
->sem_num
] -= sem_op
;
248 if (curr
->semval
< 0)
250 if (curr
->semval
> SEMVMX
)
261 sma
->sem_otime
= CURRENT_TIME
;
269 if (sop
->sem_flg
& IPC_NOWAIT
)
275 while (sop
>= sops
) {
276 curr
= sma
->sem_base
+ sop
->sem_num
;
277 curr
->semval
-= sop
->sem_op
;
280 if (sop
->sem_flg
& SEM_UNDO
)
281 un
->semadj
[sop
->sem_num
] += sop
->sem_op
;
288 /* Go through the pending queue for the indicated semaphore
289 * looking for tasks that can be completed.
291 static void update_queue (struct semid_ds
* sma
)
294 struct sem_queue
* q
;
296 for (q
= sma
->sem_pending
; q
; q
= q
->next
) {
299 return; /* wait for other process */
301 error
= try_atomic_semop(sma
, q
->sops
, q
->nsops
,
302 q
->undo
, q
->pid
, q
->alter
);
304 /* Does q->sleeper still need to sleep? */
306 /* Found one, wake it up */
307 wake_up_interruptible(&q
->sleeper
);
308 if (error
== 0 && q
->alter
) {
309 /* if q-> alter let it self try */
314 remove_from_queue(sma
,q
);
319 /* The following counts are associated to each semaphore:
320 * semncnt number of tasks waiting on semval being nonzero
321 * semzcnt number of tasks waiting on semval being zero
322 * This model assumes that a task waits on exactly one semaphore.
323 * Since semaphore operations are to be performed atomically, tasks actually
324 * wait on a whole sequence of semaphores simultaneously.
325 * The counts we return here are a rough approximation, but still
326 * warrant that semncnt+semzcnt>0 if the task is on the pending queue.
328 static int count_semncnt (struct semid_ds
* sma
, ushort semnum
)
331 struct sem_queue
* q
;
334 for (q
= sma
->sem_pending
; q
; q
= q
->next
) {
335 struct sembuf
* sops
= q
->sops
;
336 int nsops
= q
->nsops
;
338 for (i
= 0; i
< nsops
; i
++)
339 if (sops
[i
].sem_num
== semnum
340 && (sops
[i
].sem_op
< 0)
341 && !(sops
[i
].sem_flg
& IPC_NOWAIT
))
346 static int count_semzcnt (struct semid_ds
* sma
, ushort semnum
)
349 struct sem_queue
* q
;
352 for (q
= sma
->sem_pending
; q
; q
= q
->next
) {
353 struct sembuf
* sops
= q
->sops
;
354 int nsops
= q
->nsops
;
356 for (i
= 0; i
< nsops
; i
++)
357 if (sops
[i
].sem_num
== semnum
358 && (sops
[i
].sem_op
== 0)
359 && !(sops
[i
].sem_flg
& IPC_NOWAIT
))
365 /* Free a semaphore set. */
366 static void freeary (int id
)
368 struct semid_ds
*sma
= semary
[id
];
372 /* Invalidate this semaphore set */
374 sem_seq
= (sem_seq
+1) % ((unsigned)(1<<31)/SEMMNI
); /* increment, but avoid overflow */
375 used_sems
-= sma
->sem_nsems
;
377 while (max_semid
&& (semary
[--max_semid
] == IPC_UNUSED
));
378 semary
[id
] = (struct semid_ds
*) IPC_UNUSED
;
381 /* Invalidate the existing undo structures for this semaphore set.
382 * (They will be freed without any further action in sem_exit().)
384 for (un
= sma
->undo
; un
; un
= un
->id_next
)
387 /* Wake up all pending processes and let them fail with EIDRM. */
388 for (q
= sma
->sem_pending
; q
; q
= q
->next
) {
391 wake_up_interruptible(&q
->sleeper
); /* doesn't sleep! */
397 asmlinkage
int sys_semctl (int semid
, int semnum
, int cmd
, union semun arg
)
399 struct semid_ds
*buf
= NULL
;
400 struct semid_ds tbuf
;
402 struct semid_ds
*sma
;
403 struct ipc_perm
*ipcp
;
404 struct sem
*curr
= NULL
;
407 ushort
*array
= NULL
;
408 ushort sem_io
[SEMMSL
];
412 if (semid
< 0 || semnum
< 0 || cmd
< 0)
419 struct seminfo seminfo
, *tmp
= arg
.__buf
;
420 seminfo
.semmni
= SEMMNI
;
421 seminfo
.semmns
= SEMMNS
;
422 seminfo
.semmsl
= SEMMSL
;
423 seminfo
.semopm
= SEMOPM
;
424 seminfo
.semvmx
= SEMVMX
;
425 seminfo
.semmnu
= SEMMNU
;
426 seminfo
.semmap
= SEMMAP
;
427 seminfo
.semume
= SEMUME
;
428 seminfo
.semusz
= SEMUSZ
;
429 seminfo
.semaem
= SEMAEM
;
430 if (cmd
== SEM_INFO
) {
431 seminfo
.semusz
= used_semids
;
432 seminfo
.semaem
= used_sems
;
435 if (copy_to_user (tmp
, &seminfo
, sizeof(struct seminfo
)))
444 if (semid
> max_semid
)
447 if (sma
== IPC_UNUSED
|| sma
== IPC_NOID
)
450 if (ipcperms (&sma
->sem_perm
, S_IRUGO
))
452 id
= (unsigned int) sma
->sem_perm
.seq
* SEMMNI
+ semid
;
453 tbuf
.sem_perm
= sma
->sem_perm
;
454 tbuf
.sem_otime
= sma
->sem_otime
;
455 tbuf
.sem_ctime
= sma
->sem_ctime
;
456 tbuf
.sem_nsems
= sma
->sem_nsems
;
458 if (copy_to_user (buf
, &tbuf
, sizeof(*buf
)) == 0)
463 id
= (unsigned int) semid
% SEMMNI
;
466 if (sma
== IPC_UNUSED
|| sma
== IPC_NOID
)
468 ipcp
= &sma
->sem_perm
;
469 nsems
= sma
->sem_nsems
;
471 if (sma
->sem_perm
.seq
!= (unsigned int) semid
/ SEMMNI
)
483 curr
= &sma
->sem_base
[semnum
];
494 if (ipcperms (ipcp
, S_IRUGO
))
497 case GETVAL
: err
= curr
->semval
; goto out
;
498 case GETPID
: err
= curr
->sempid
& 0xffff; goto out
;
499 case GETNCNT
: err
= count_semncnt(sma
,semnum
); goto out
;
500 case GETZCNT
: err
= count_semzcnt(sma
,semnum
); goto out
;
509 if (val
> SEMVMX
|| val
< 0)
513 if (current
->euid
== ipcp
->cuid
||
514 current
->euid
== ipcp
->uid
|| capable(CAP_SYS_ADMIN
)) {
521 case SETALL
: /* arg is a pointer to an array of ushort */
524 if (copy_from_user (sem_io
, array
, nsems
*sizeof(ushort
)))
527 for (i
= 0; i
< nsems
; i
++)
528 if (sem_io
[i
] > SEMVMX
) {
538 err
= copy_from_user (&tbuf
, buf
, sizeof (*buf
));
545 if (semary
[id
] == IPC_UNUSED
|| semary
[id
] == IPC_NOID
)
547 if (sma
->sem_perm
.seq
!= (unsigned int) semid
/ SEMMNI
)
553 if (ipcperms (ipcp
, S_IRUGO
))
555 for (i
= 0; i
< sma
->sem_nsems
; i
++)
556 sem_io
[i
] = sma
->sem_base
[i
].semval
;
557 if (copy_to_user (array
, sem_io
, nsems
*sizeof(ushort
)))
562 if (ipcperms (ipcp
, S_IWUGO
))
564 for (un
= sma
->undo
; un
; un
= un
->id_next
)
565 un
->semadj
[semnum
] = 0;
567 sma
->sem_ctime
= CURRENT_TIME
;
568 /* maybe some queued-up processes were waiting for this */
572 if (current
->euid
== ipcp
->cuid
||
573 current
->euid
== ipcp
->uid
|| capable(CAP_SYS_ADMIN
)) {
574 ipcp
->uid
= tbuf
.sem_perm
.uid
;
575 ipcp
->gid
= tbuf
.sem_perm
.gid
;
576 ipcp
->mode
= (ipcp
->mode
& ~S_IRWXUGO
)
577 | (tbuf
.sem_perm
.mode
& S_IRWXUGO
);
578 sma
->sem_ctime
= CURRENT_TIME
;
586 if (ipcperms (ipcp
, S_IRUGO
))
588 tbuf
.sem_perm
= sma
->sem_perm
;
589 tbuf
.sem_otime
= sma
->sem_otime
;
590 tbuf
.sem_ctime
= sma
->sem_ctime
;
591 tbuf
.sem_nsems
= sma
->sem_nsems
;
592 if (copy_to_user (buf
, &tbuf
, sizeof(*buf
)))
597 if (ipcperms (ipcp
, S_IWUGO
))
599 for (i
= 0; i
< nsems
; i
++)
600 sma
->sem_base
[i
].semval
= sem_io
[i
];
601 for (un
= sma
->undo
; un
; un
= un
->id_next
)
602 for (i
= 0; i
< nsems
; i
++)
604 sma
->sem_ctime
= CURRENT_TIME
;
605 /* maybe some queued-up processes were waiting for this */
618 asmlinkage
int sys_semop (int semid
, struct sembuf
*tsops
, unsigned nsops
)
620 int id
, size
, error
= -EINVAL
;
621 struct semid_ds
*sma
;
622 struct sembuf sops
[SEMOPM
], *sop
;
624 int undos
= 0, decrease
= 0, alter
= 0;
625 struct sem_queue queue
;
628 if (nsops
< 1 || semid
< 0)
634 if (copy_from_user (sops
, tsops
, nsops
* sizeof(*tsops
)))
636 id
= (unsigned int) semid
% SEMMNI
;
638 if ((sma
= semary
[id
]) == IPC_UNUSED
|| sma
== IPC_NOID
)
641 if (sma
->sem_perm
.seq
!= (unsigned int) semid
/ SEMMNI
)
645 for (sop
= sops
; sop
< sops
+ nsops
; sop
++) {
646 if (sop
->sem_num
>= sma
->sem_nsems
)
648 if (sop
->sem_flg
& SEM_UNDO
)
658 if (ipcperms(&sma
->sem_perm
, alter
? S_IWUGO
: S_IRUGO
))
661 /* Make sure we have an undo structure
662 * for this process and this semaphore set.
664 for (un
= current
->semundo
; un
; un
= un
->proc_next
)
665 if (un
->semid
== semid
)
668 size
= sizeof(struct sem_undo
) + sizeof(short)*sma
->sem_nsems
;
669 un
= (struct sem_undo
*) kmalloc(size
, GFP_ATOMIC
);
675 un
->semadj
= (short *) &un
[1];
677 un
->proc_next
= current
->semundo
;
678 current
->semundo
= un
;
679 un
->id_next
= sma
->undo
;
685 error
= try_atomic_semop (sma
, sops
, nsops
, un
, current
->pid
, 0);
689 /* We need to sleep on this operation, so we put the current
690 * task into the pending queue and go to sleep.
697 queue
.pid
= current
->pid
;
698 queue
.alter
= decrease
;
699 current
->semsleeping
= &queue
;
701 append_to_queue(sma
,&queue
);
703 prepend_to_queue(sma
,&queue
);
706 queue
.status
= -EINTR
;
707 init_waitqueue_head(&queue
.sleeper
);
708 interruptible_sleep_on(&queue
.sleeper
);
711 * If queue.status == 1 we where woken up and
712 * have to retry else we simply return.
713 * If an interrupt occurred we have to clean up the
717 if (queue
.status
== 1)
719 error
= try_atomic_semop (sma
, sops
, nsops
, un
,
724 error
= queue
.status
;;
725 if (queue
.prev
) /* got Interrupt */
727 /* Everything done by update_queue */
728 current
->semsleeping
= NULL
;
732 current
->semsleeping
= NULL
;
733 remove_from_queue(sma
,&queue
);
743 * add semadj values to semaphores, free undo structures.
744 * undo structures are not freed when semaphore arrays are destroyed
745 * so some of them may be out of date.
746 * IMPLEMENTATION NOTE: There is some confusion over whether the
747 * set of adjustments that needs to be done should be done in an atomic
748 * manner or not. That is, if we are attempting to decrement the semval
749 * should we queue up and wait until we can do so legally?
750 * The original implementation attempted to do this (queue and wait).
751 * The current implementation does not do so. The POSIX standard
752 * and SVID should be consulted to determine what behavior is mandated.
757 struct sem_undo
*u
, *un
= NULL
, **up
, **unp
;
758 struct semid_ds
*sma
;
761 /* If the current process was sleeping for a semaphore,
762 * remove it from the queue.
764 if ((q
= current
->semsleeping
)) {
766 remove_from_queue(q
->sma
,q
);
767 current
->semsleeping
= NULL
;
770 for (up
= ¤t
->semundo
; (u
= *up
); *up
= u
->proc_next
, kfree(u
)) {
773 sma
= semary
[(unsigned int) u
->semid
% SEMMNI
];
774 if (sma
== IPC_UNUSED
|| sma
== IPC_NOID
)
776 if (sma
->sem_perm
.seq
!= (unsigned int) u
->semid
/ SEMMNI
)
778 /* remove u from the sma->undo list */
779 for (unp
= &sma
->undo
; (un
= *unp
); unp
= &un
->id_next
) {
783 printk ("sem_exit undo list error id=%d\n", u
->semid
);
787 /* perform adjustments registered in u */
788 nsems
= sma
->sem_nsems
;
789 for (i
= 0; i
< nsems
; i
++) {
790 struct sem
* sem
= &sma
->sem_base
[i
];
791 sem
->semval
+= u
->semadj
[i
];
793 sem
->semval
= 0; /* shouldn't happen */
794 sem
->sempid
= current
->pid
;
796 sma
->sem_otime
= CURRENT_TIME
;
797 /* maybe some queued-up processes were waiting for this */
800 current
->semundo
= NULL
;
803 #ifdef CONFIG_PROC_FS
804 static int sysvipc_sem_read_proc(char *buffer
, char **start
, off_t offset
, int length
, int *eof
, void *data
)
810 len
+= sprintf(buffer
, " key semid perms nsems uid gid cuid cgid otime ctime\n");
812 for(i
= 0; i
< SEMMNI
; i
++)
813 if(semary
[i
] != IPC_UNUSED
) {
814 len
+= sprintf(buffer
+ len
, "%10d %10d %4o %5u %5u %5u %5u %5u %10lu %10lu\n",
815 semary
[i
]->sem_perm
.key
,
816 semary
[i
]->sem_perm
.seq
* SEMMNI
+ i
,
817 semary
[i
]->sem_perm
.mode
,
818 semary
[i
]->sem_nsems
,
819 semary
[i
]->sem_perm
.uid
,
820 semary
[i
]->sem_perm
.gid
,
821 semary
[i
]->sem_perm
.cuid
,
822 semary
[i
]->sem_perm
.cgid
,
823 semary
[i
]->sem_otime
,
824 semary
[i
]->sem_ctime
);
831 if(pos
> offset
+ length
)
836 *start
= buffer
+ (offset
- begin
);
837 len
-= (offset
- begin
);