/*
 * Unix SMB/CIFS implementation.
 * Support for OneFS system interfaces.
 *
 * Copyright (C) Tim Prouty, 2008
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
22 #include "smbd/smbd.h"
24 #include "onefs_config.h"
25 #include "oplock_onefs.h"
27 #include <ifs/ifs_syscalls.h>
28 #include <isi_acl/isi_acl_util.h>
29 #include <sys/isi_acl.h>
32 * Initialize the sm_lock struct before passing it to ifs_createfile.
34 static void smlock_init(connection_struct
*conn
, struct sm_lock
*sml
,
35 bool isexe
, uint32_t access_mask
, uint32_t share_access
,
36 uint32_t create_options
)
38 sml
->sm_type
.doc
= false;
39 sml
->sm_type
.isexe
= isexe
;
40 sml
->sm_type
.statonly
= is_stat_open(access_mask
);
41 sml
->sm_type
.access_mask
= access_mask
;
42 sml
->sm_type
.share_access
= share_access
;
45 * private_options was previously used for DENY_DOS/DENY_FCB checks in
46 * the kernel, but are now properly handled by fcb_or_dos_open. In
47 * these cases, ifs_createfile will return a sharing violation, which
48 * gives fcb_or_dos_open the chance to open a duplicate file handle.
50 sml
->sm_type
.private_options
= 0;
52 /* 1 second delay is handled in onefs_open.c by deferring the open */
53 sml
->sm_timeout
= timeval_set(0, 0);
56 static void smlock_dump(int debuglevel
, const struct sm_lock
*sml
)
59 DEBUG(debuglevel
, ("sml == NULL\n"));
64 ("smlock: doc=%s, isexec=%s, statonly=%s, access_mask=0x%x, "
65 "share_access=0x%x, private_options=0x%x timeout=%d/%d\n",
66 sml
->sm_type
.doc
? "True" : "False",
67 sml
->sm_type
.isexe
? "True" : "False",
68 sml
->sm_type
.statonly
? "True" : "False",
69 sml
->sm_type
.access_mask
,
70 sml
->sm_type
.share_access
,
71 sml
->sm_type
.private_options
,
72 (int)sml
->sm_timeout
.tv_sec
,
73 (int)sml
->sm_timeout
.tv_usec
));
/**
 * External interface to ifs_createfile
 */
/*
 * Open or create a file through ifs_createfile().
 *
 * NOTE(review): part of the parameter list, several declarations, the
 * braces and the error-path statements are not present in the text below,
 * so only the visible steps are described:
 *
 *  - path is duplicated onto talloc_tos() and run through
 *    SMB_VFS_TRANSLATE_NAME(..., vfs_translate_to_unix) to get mapped_name.
 *  - sd is converted with onefs_samba_sd_to_sd(), restricted to
 *    IFS_SEC_INFO_KNOWN_MASK; the call also fills sec_info_effective.
 *  - oplock_request is converted with onefs_samba_oplock_to_oplock() and
 *    dos_flags with dos_attributes_to_stat_dos_flags().
 *  - CF_FLAGS_DEFAULT_ACL is added to cf_flags when lp_nt_acl_support() is
 *    true and lp_inherit_perms() is false.
 *  - When PARM_ALLOW_EXECUTE_ALWAYS is enabled and FILE_EXECUTE is
 *    requested, FILE_EXECUTE is cleared from open_access_mask and
 *    FILE_READ_DATA is set instead.
 *  - smlock_init() is called unless oplock_request has INTERNAL_OPEN_ONLY.
 *  - ifs_createfile() is called with flags & ~O_ACCMODE, and the granted
 *    oplock is converted with onefs_oplock_to_samba_oplock() when
 *    granted_oplock is non-NULL.
 *  - aclu_free_sd(pifs_sd, false) and TALLOC_FREE(mapped_name) run at the
 *    end of the function.
 */
79 int onefs_sys_create_file(connection_struct
*conn
,
83 uint32_t open_access_mask
,
84 uint32_t share_access
,
85 uint32_t create_options
,
90 struct security_descriptor
*sd
,
94 struct sm_lock sml
, *psml
= NULL
;
95 enum oplock_type onefs_oplock
;
96 enum oplock_type onefs_granted_oplock
= OPLOCK_NONE
;
97 struct ifs_security_descriptor ifs_sd
= {}, *pifs_sd
= NULL
;
98 uint32_t sec_info_effective
= 0;
100 uint32_t onefs_dos_attributes
;
101 struct ifs_createfile_flags cf_flags
= CF_FLAGS_NONE
;
102 char *mapped_name
= NULL
;
105 START_PROFILE(syscall_createfile
);
107 /* Translate the name to UNIX before calling ifs_createfile */
108 mapped_name
= talloc_strdup(talloc_tos(), path
);
109 if (mapped_name
== NULL
) {
113 result
= SMB_VFS_TRANSLATE_NAME(conn
, &mapped_name
,
114 vfs_translate_to_unix
);
115 if (!NT_STATUS_IS_OK(result
)) {
119 /* Setup security descriptor and get secinfo. */
122 uint32_t sec_info_sent
= 0;
124 sec_info_sent
= (get_sec_info(sd
) & IFS_SEC_INFO_KNOWN_MASK
);
126 status
= onefs_samba_sd_to_sd(sec_info_sent
, sd
, &ifs_sd
,
127 SNUM(conn
), &sec_info_effective
);
129 if (!NT_STATUS_IS_OK(status
)) {
130 DEBUG(1, ("SD initialization failure: %s\n",
139 /* Stripping off private bits will be done for us. */
140 onefs_oplock
= onefs_samba_oplock_to_oplock(oplock_request
);
/* With oplocks disabled for the share, the converted request must be OPLOCK_NONE. */
142 if (!lp_oplocks(SNUM(conn
))) {
143 SMB_ASSERT(onefs_oplock
== OPLOCK_NONE
);
146 /* Convert samba dos flags to UF_DOS_* attributes. */
147 onefs_dos_attributes
= dos_attributes_to_stat_dos_flags(dos_flags
);
150 * Deal with kernel creating Default ACLs. (Isilon bug 47447.)
152 * 1) "nt acl support = no", default_acl = no
153 * 2) "inherit permissions = yes", default_acl = no
155 if (lp_nt_acl_support(SNUM(conn
)) && !lp_inherit_perms(SNUM(conn
)))
156 cf_flags
= cf_flags_or(cf_flags
, CF_FLAGS_DEFAULT_ACL
);
159 * Some customer workflows require the execute bit to be ignored.
161 if (lp_parm_bool(SNUM(conn
), PARM_ONEFS_TYPE
,
162 PARM_ALLOW_EXECUTE_ALWAYS
,
163 PARM_ALLOW_EXECUTE_ALWAYS_DEFAULT
) &&
164 (open_access_mask
& FILE_EXECUTE
)) {
166 DEBUG(3, ("Stripping execute bit from %s: (0x%x)\n", mapped_name
,
170 open_access_mask
&= ~FILE_EXECUTE
;
173 * Add READ_DATA, so we're not left with desired_access=0. An
174 * execute call should imply the client will read the data.
176 open_access_mask
|= FILE_READ_DATA
;
178 DEBUGADD(3, ("New stripped access mask: 0x%x\n",
182 DEBUG(10,("onefs_sys_create_file: base_fd = %d, fname = %s "
183 "open_access_mask = 0x%x, flags = 0x%x, mode = 0%o, "
184 "desired_oplock = %s, id = 0x%x, secinfo = 0x%x, sd = %p, "
185 "dos_attributes = 0x%x, path = %s, "
186 "default_acl=%s\n", base_fd
, mapped_name
,
187 (unsigned int)open_access_mask
,
190 onefs_oplock_str(onefs_oplock
),
192 sec_info_effective
, sd
,
193 (unsigned int)onefs_dos_attributes
, mapped_name
,
194 cf_flags_and_bool(cf_flags
, CF_FLAGS_DEFAULT_ACL
) ?
197 /* Initialize smlock struct for files/dirs but not internal opens */
198 if (!(oplock_request
& INTERNAL_OPEN_ONLY
)) {
199 smlock_init(conn
, &sml
, is_executable(mapped_name
), access_mask
,
200 share_access
, create_options
);
204 smlock_dump(10, psml
);
/* The O_ACCMODE bits are masked out of flags; the access request travels in open_access_mask. */
206 ret_fd
= ifs_createfile(base_fd
, mapped_name
,
207 (enum ifs_ace_rights
)open_access_mask
, flags
& ~O_ACCMODE
, mode
,
208 onefs_oplock
, id
, psml
, sec_info_effective
, pifs_sd
,
209 onefs_dos_attributes
, cf_flags
, &onefs_granted_oplock
);
211 DEBUG(10,("onefs_sys_create_file(%s): ret_fd = %d, "
212 "onefs_granted_oplock = %s\n",
213 ret_fd
< 0 ? strerror(errno
) : "success", ret_fd
,
214 onefs_oplock_str(onefs_granted_oplock
)));
216 if (granted_oplock
) {
218 onefs_oplock_to_samba_oplock(onefs_granted_oplock
);
222 END_PROFILE(syscall_createfile
);
223 aclu_free_sd(pifs_sd
, false);
224 TALLOC_FREE(mapped_name
);
/**
 * FreeBSD based sendfile implementation that allows for atomic semantics.
 */
/*
 * Send an optional header blob followed by count bytes of a file with
 * sendfile().
 *
 * NOTE(review): the local declarations, braces and several statements of
 * this definition are not present in the text below; only the visible
 * behaviour is described:
 *
 *  - The header is described by the single iovec hdtrl, which hdr.headers
 *    points at; iov_base is NULL on the branch that does not use header.
 *  - The outer loop runs while total + hdtrl.iov_len is non-zero.
 *  - sendfile() is called as sendfile(fromfd, tofd, offset, total, &hdr, ...)
 *    and retried while it returns -1 with errno EINTR or EAGAIN (also
 *    EWOULDBLOCK where that macro is defined).
 *  - One early exit returns 0 when atomic is true and -1 otherwise.
 *  - After a partial transfer the header iovec is either cleared (all
 *    header bytes went out) or advanced by nwritten.
 *  - The final visible return value is count + hdr_len.
 */
232 static ssize_t
onefs_sys_do_sendfile(int tofd
, int fromfd
,
233 const DATA_BLOB
*header
, SMB_OFF_T offset
, size_t count
, bool atomic
)
245 hdr
.headers
= &hdtrl
;
250 /* Set up the header iovec. */
252 hdtrl
.iov_base
= (void *)header
->data
;
253 hdtrl
.iov_len
= hdr_len
= header
->length
;
255 hdtrl
.iov_base
= NULL
;
260 while (total
+ hdtrl
.iov_len
) {
265 * FreeBSD sendfile returns 0 on success, -1 on error.
266 * Remember, the tofd and fromfd are reversed..... :-).
267 * nwritten includes the header data sent.
271 ret
= sendfile(fromfd
, tofd
, offset
, total
, &hdr
,
273 #if defined(EWOULDBLOCK)
274 } while (ret
== -1 && (errno
== EINTR
|| errno
== EAGAIN
|| errno
== EWOULDBLOCK
));
276 } while (ret
== -1 && (errno
== EINTR
|| errno
== EAGAIN
));
279 /* On error we're done. */
285 * If this was an ATOMIC sendfile, nwritten doesn't
286 * necessarily indicate an error. It could mean count > than
287 * what sendfile can handle atomically (usually 64K) or that
288 * there was a short read due to the file being truncated.
291 return atomic
? 0 : -1;
295 * An atomic sendfile should never send partial data!
297 if (atomic
&& nwritten
!= total
+ hdtrl
.iov_len
) {
/*
 * NOTE(review): total + hdtrl.iov_len is printed with %d below; iov_len is
 * a size_t in struct iovec, so the specifier looks mismatched -- confirm
 * against the (not visible) declaration of total.
 */
298 DEBUG(0,("Atomic sendfile() sent partial data: "
299 "%llu of %d\n", nwritten
,
300 total
+ hdtrl
.iov_len
));
305 * If this was a short (signal interrupted) write we may need
306 * to subtract it from the header data, or null out the header
307 * data altogether if we wrote more than hdtrl.iov_len bytes.
308 * We change nwritten to be the number of file bytes written.
311 if (hdtrl
.iov_base
&& hdtrl
.iov_len
) {
312 if (nwritten
>= hdtrl
.iov_len
) {
313 nwritten
-= hdtrl
.iov_len
;
314 hdtrl
.iov_base
= NULL
;
318 (void *)((caddr_t
)hdtrl
.iov_base
+ nwritten
);
319 hdtrl
.iov_len
-= nwritten
;
326 return count
+ hdr_len
;
/**
 * Handles the subtleties of using sendfile with CIFS.
 */
/*
 * Public sendfile entry point built on onefs_sys_do_sendfile().
 *
 * NOTE(review): the end of the parameter list, the local declarations,
 * the braces and most return statements are not present in the text
 * below; only the visible behaviour is described:
 *
 *  - The PARM_ATOMIC_SENDFILE share option is consulted before the first
 *    onefs_sys_do_sendfile() call.
 *  - A result of -1 with errno == E2BIG is treated as "request too large
 *    for an atomic send".  If PARM_SENDFILE_LARGE_READS is off this is
 *    only logged; otherwise onefs_sys_do_sendfile() is called again.
 *  - After that second call, ret != count + header->length is logged as a
 *    short read, and a further share option (default
 *    PARM_SENDFILE_SAFE_DEFAULT) is consulted.
 *  - Every visible exit path is preceded by END_PROFILE(syscall_sendfile).
 */
332 ssize_t
onefs_sys_sendfile(connection_struct
*conn
, int tofd
, int fromfd
,
333 const DATA_BLOB
*header
, SMB_OFF_T offset
,
339 START_PROFILE_BYTES(syscall_sendfile
, count
);
341 if (lp_parm_bool(SNUM(conn
), PARM_ONEFS_TYPE
,
342 PARM_ATOMIC_SENDFILE
,
343 PARM_ATOMIC_SENDFILE_DEFAULT
)) {
347 /* Try the sendfile */
348 ret
= onefs_sys_do_sendfile(tofd
, fromfd
, header
, offset
, count
,
351 /* If the sendfile wasn't atomic, we're done. */
/*
 * NOTE(review): "%ul" is parsed by printf-style formatting as %u followed
 * by a literal 'l'; presumably a long/size specifier was intended for
 * ret -- confirm.
 */
353 DEBUG(10, ("non-atomic sendfile read %ul bytes\n", ret
));
354 END_PROFILE(syscall_sendfile
);
359 * Atomic sendfile takes care to not write anything to the socket
360 * until all of the requested bytes have been read from the file.
361 * There are two atomic cases that need to be handled.
363 * 1. The file was truncated causing less data to be read than was
364 * requested. In this case, we return back to the caller to
365 * indicate 0 bytes were written to the socket. This should
366 * prompt the caller to fallback to the standard read path: read
367 * the data, create a header that indicates how many bytes were
368 * actually read, and send the header/data back to the client.
370 * This saves us from standard sendfile behavior of sending a
371 * header promising more data then will actually be sent. The
372 * only two options are to close the socket and kill the client
373 * connection, or write a bunch of 0s. Closing the client
374 * connection is bad because there could actually be multiple
375 * sessions multiplexed from the same client that are all dropped
376 * because of a truncate. Writing the remaining data as 0s also
377 * isn't good, because the client will have an incorrect version
378 * of the file. If the file is written back to the server, the 0s
379 * will be written back. Fortunately, atomic sendfile allows us
380 * to avoid making this choice in most cases.
382 * 2. One downside of atomic sendfile, is that there is a limit on
383 * the number of bytes that can be sent atomically. The kernel
384 * has a limited amount of mbuf space that it can read file data
385 * into without exhausting the system's mbufs, so a buffer of
386 * length xfsize is used. The xfsize at the time of writing this
387 * is 64K. xfsize bytes are read from the file, and subsequently
388 * written to the socket. This makes it impossible to do the
389 * sendfile atomically for a byte count > xfsize.
391 * To cope with large requests, atomic sendfile returns -1 with
392 * errno set to E2BIG. Since windows maxes out at 64K writes,
393 * this is currently only a concern with non-windows clients.
394 * Posix extensions allow the full 24bit bytecount field to be
395 * used in ReadAndX, and clients such as smbclient and the linux
396 * cifs client can request up to 16MB reads! There are a few
397 * options for handling large sendfile requests.
399 * a. Fall back to the standard read path. This is unacceptable
400 * because it would require prohibitively large mallocs.
402 * b. Fall back to using samba's fake_send_file which emulates
403 * the kernel sendfile in userspace. This still has the same
404 * problem of sending the header before all of the data has
405 * been read, so it doesn't buy us anything, and has worse
406 * performance than the kernel's zero-copy sendfile.
408 * c. Use non-atomic sendfile syscall to attempt a zero copy
409 * read, and hope that there isn't a short read due to
410 * truncation. In the case of a short read, there are two
413 * 1. Kill the client connection
415 * 2. Write zeros to the socket for the remaining bytes
416 * promised in the header.
418 * It is safer from a data corruption perspective to kill the
419 * client connection, so this is our default behavior, but if
420 * this causes problems this can be configured to write zeros
424 /* Handle case 1: short read -> truncated file. */
426 END_PROFILE(syscall_sendfile
);
430 /* Handle case 2: large read. */
431 if (ret
== -1 && errno
== E2BIG
) {
433 if (!lp_parm_bool(SNUM(conn
), PARM_ONEFS_TYPE
,
434 PARM_SENDFILE_LARGE_READS
,
435 PARM_SENDFILE_LARGE_READS_DEFAULT
)) {
436 DEBUG(3, ("Not attempting non-atomic large sendfile: "
437 "%lu bytes\n", count
));
438 END_PROFILE(syscall_sendfile
);
442 if (count
< 0x10000) {
443 DEBUG(0, ("Count < 2^16 and E2BIG was returned! %lu\n",
447 DEBUG(10, ("attempting non-atomic large sendfile: %lu bytes\n",
450 /* Try a non-atomic sendfile. */
451 ret
= onefs_sys_do_sendfile(tofd
, fromfd
, header
, offset
,
453 /* Real error: kill the client connection. */
455 DEBUG(1, ("error on non-atomic large sendfile "
456 "(%lu bytes): %s\n", count
,
458 END_PROFILE(syscall_sendfile
);
462 /* Short read: kill the client connection. */
/*
 * NOTE(review): header is dereferenced here without a visible NULL check,
 * while onefs_sys_do_sendfile() has a branch that sets iov_base to NULL
 * instead of reading header -- confirm header cannot be NULL on this path.
 */
463 if (ret
!= count
+ header
->length
) {
464 DEBUG(1, ("short read on non-atomic large sendfile "
465 "(%lu of %lu bytes): %s\n", ret
, count
,
469 * Returning ret here would cause us to drop into the
470 * codepath that calls sendfile_short_send, which
471 * sends the client a bunch of zeros instead.
472 * Returning -1 kills the connection.
474 if (lp_parm_bool(SNUM(conn
), PARM_ONEFS_TYPE
,
476 PARM_SENDFILE_SAFE_DEFAULT
)) {
477 END_PROFILE(syscall_sendfile
);
481 END_PROFILE(syscall_sendfile
);
485 DEBUG(10, ("non-atomic large sendfile successful\n"));
488 /* There was error in the atomic sendfile. */
490 DEBUG(1, ("error on %s sendfile (%lu bytes): %s\n",
491 atomic
? "atomic" : "non-atomic",
492 count
, strerror(errno
)));
495 END_PROFILE(syscall_sendfile
);
500 * Only talloc the spill buffer once (reallocing when necessary).
502 static char *get_spill_buffer(size_t new_count
)
504 static int cur_count
= 0;
505 static char *spill_buffer
= NULL
;
507 /* If a sufficiently sized buffer exists, just return. */
508 if (new_count
<= cur_count
) {
509 SMB_ASSERT(spill_buffer
);
513 /* Allocate the first time. */
514 if (cur_count
== 0) {
515 SMB_ASSERT(!spill_buffer
);
516 spill_buffer
= talloc_array(NULL
, char, new_count
);
518 cur_count
= new_count
;
523 /* A buffer exists, but it's not big enough, so realloc. */
524 SMB_ASSERT(spill_buffer
);
525 spill_buffer
= talloc_realloc(NULL
, spill_buffer
, char, new_count
);
527 cur_count
= new_count
;
/**
 * recvfile does zero-copy writes given an fd to write to, and a socket with
 * some data to write.  If recvfile read more than it was able to write, it
 * spills the data into a buffer.  After first reading any additional data
 * from the socket into the buffer, the spill buffer is then written with a
 * standard pwrite.
 */
/*
 * Move count bytes from a socket into a file using recvfile(), falling
 * back to read + pwrite for whatever recvfile() did not finish.
 *
 * NOTE(review): the end of the parameter list, some declarations, the
 * braces and the bookkeeping/return statements are not present in the
 * text below; only the visible behaviour is described:
 *
 *  - A spill buffer of count bytes is obtained from get_spill_buffer().
 *  - recvfile() is retried in a loop while bytes remain, rbytes == wbytes,
 *    and it failed with EINTR or EAGAIN; rbytes/wbytes are accumulated
 *    into total_rbytes/total_wbytes.
 *  - While total_rbytes < count, the rest of the socket data is read with
 *    sys_read() into the spill buffer at offset
 *    total_rbytes - total_wbytes.
 *  - While total_wbytes < count, the spill buffer is written with
 *    sys_pwrite() at file offset offset + total_wbytes.
 *  - Before leaving, if the socket has not been marked drained and bytes
 *    are still unread, drain_socket() is called for the remainder.
 */
539 ssize_t
onefs_sys_recvfile(int fromfd
, int tofd
, SMB_OFF_T offset
,
542 char *spill_buffer
= NULL
;
543 bool socket_drained
= false;
545 off_t total_rbytes
= 0;
546 off_t total_wbytes
= 0;
550 START_PROFILE_BYTES(syscall_recvfile
, count
);
552 DEBUG(10,("onefs_recvfile: from = %d, to = %d, offset=%llu, count = "
553 "%lu\n", fromfd
, tofd
, offset
, count
));
556 END_PROFILE(syscall_recvfile
);
561 * Setup up a buffer for recvfile to spill data that has been read
562 * from the socket but not written.
564 spill_buffer
= get_spill_buffer(count
);
565 if (spill_buffer
== NULL
) {
571 * Keep trying recvfile until:
572 * - There is no data left to read on the socket, or
573 * - bytes read != bytes written, or
574 * - An error is returned that isn't EINTR/EAGAIN
577 /* Keep track of bytes read/written for recvfile */
581 DEBUG(10, ("calling recvfile loop, offset + total_wbytes = "
582 "%llu, count - total_rbytes = %llu\n",
583 offset
+ total_wbytes
, count
- total_rbytes
));
585 ret
= recvfile(tofd
, fromfd
, offset
+ total_wbytes
,
586 count
- total_wbytes
, &rbytes
, &wbytes
, 0,
589 DEBUG(10, ("recvfile ret = %d, errno = %d, rbytes = %llu, "
590 "wbytes = %llu\n", ret
, ret
>= 0 ? 0 : errno
,
593 /* Update our progress so far */
594 total_rbytes
+= rbytes
;
595 total_wbytes
+= wbytes
;
597 } while ((count
- total_rbytes
) && (rbytes
== wbytes
) &&
598 (ret
== -1 && (errno
== EINTR
|| errno
== EAGAIN
)));
600 DEBUG(10, ("total_rbytes = %llu, total_wbytes = %llu\n",
601 total_rbytes
, total_wbytes
));
603 /* Log if recvfile didn't write everything it read. */
604 if (total_rbytes
!= total_wbytes
) {
605 DEBUG(3, ("partial recvfile: total_rbytes=%llu but "
606 "total_wbytes=%llu, diff = %llu\n", total_rbytes
,
607 total_wbytes
, total_rbytes
- total_wbytes
));
608 SMB_ASSERT(total_rbytes
> total_wbytes
);
612 * If there is still data on the socket, read it off.
614 while (total_rbytes
< count
) {
616 DEBUG(3, ("shallow recvfile (%s), reading %llu\n",
617 strerror(errno
), count
- total_rbytes
));
620 * Read the remaining data into the spill buffer. recvfile
621 * may already have some data in the spill buffer, so start
622 * filling the buffer at total_rbytes - total_wbytes.
624 ret
= sys_read(fromfd
,
625 spill_buffer
+ (total_rbytes
- total_wbytes
),
626 count
- total_rbytes
);
630 DEBUG(0, ("shallow recvfile read: EOF\n"));
632 DEBUG(0, ("shallow recvfile read failed: %s\n",
635 /* Socket is dead, so treat as if it were drained. */
636 socket_drained
= true;
640 /* Data was read so update the rbytes */
644 if (total_rbytes
!= count
) {
645 smb_panic("Unread recvfile data still on the socket!");
649 * Now write any spilled data + the extra data read off the socket.
651 while (total_wbytes
< count
) {
653 DEBUG(3, ("partial recvfile, writing %llu\n", count
- total_wbytes
));
/*
 * NOTE(review): the source pointer is always the start of spill_buffer.
 * If this loop runs a second time after a short write, presumably the
 * bytes already written would be written again at the new offset --
 * confirm against the (not visible) total_wbytes update.
 */
655 ret
= sys_pwrite(tofd
, spill_buffer
, count
- total_wbytes
,
656 offset
+ total_wbytes
);
659 DEBUG(0, ("partial recvfile write failed: %s\n",
664 /* Data was written so update the wbytes */
673 END_PROFILE(syscall_recvfile
);
675 /* Make sure we always try to drain the socket. */
676 if (!socket_drained
&& count
- total_rbytes
) {
/* errno is captured before drain_socket() runs, since that call may change it. */
677 int saved_errno
= errno
;
679 if (drain_socket(fromfd
, count
- total_rbytes
) !=
680 count
- total_rbytes
) {
681 /* Socket is dead! */
682 DEBUG(0, ("drain socket failed: %d\n", errno
));
/*
 * Copy the fields of a native struct stat into a struct stat_ex.
 *
 * Each st_ex_* member is assigned from the like-named st_* member.  The
 * access, modify, change and birth times are taken from st_atimespec,
 * st_mtimespec, st_ctimespec and st_birthtimespec, st_ex_flags from
 * st_flags, and vfs_private receives st_snapid.
 *
 * NOTE(review): the text between the signature and the first assignment
 * is not visible here, so any preliminary setup of *dst is not described.
 */
690 void init_stat_ex_from_onefs_stat(struct stat_ex
*dst
, const struct stat
*src
)
694 dst
->st_ex_dev
= src
->st_dev
;
695 dst
->st_ex_ino
= src
->st_ino
;
696 dst
->st_ex_mode
= src
->st_mode
;
697 dst
->st_ex_nlink
= src
->st_nlink
;
698 dst
->st_ex_uid
= src
->st_uid
;
699 dst
->st_ex_gid
= src
->st_gid
;
700 dst
->st_ex_rdev
= src
->st_rdev
;
701 dst
->st_ex_size
= src
->st_size
;
702 dst
->st_ex_atime
= src
->st_atimespec
;
703 dst
->st_ex_mtime
= src
->st_mtimespec
;
704 dst
->st_ex_ctime
= src
->st_ctimespec
;
705 dst
->st_ex_btime
= src
->st_birthtimespec
;
706 dst
->st_ex_blksize
= src
->st_blksize
;
707 dst
->st_ex_blocks
= src
->st_blocks
;
709 dst
->st_ex_flags
= src
->st_flags
;
/* The snapshot id is carried in the generic vfs_private slot. */
711 dst
->vfs_private
= src
->st_snapid
;
714 int onefs_sys_stat(const char *fname
, SMB_STRUCT_STAT
*sbuf
)
717 struct stat onefs_sbuf
;
719 ret
= stat(fname
, &onefs_sbuf
);
722 /* we always want directories to appear zero size */
723 if (S_ISDIR(onefs_sbuf
.st_mode
)) {
724 onefs_sbuf
.st_size
= 0;
726 init_stat_ex_from_onefs_stat(sbuf
, &onefs_sbuf
);
731 int onefs_sys_fstat(int fd
, SMB_STRUCT_STAT
*sbuf
)
734 struct stat onefs_sbuf
;
736 ret
= fstat(fd
, &onefs_sbuf
);
739 /* we always want directories to appear zero size */
740 if (S_ISDIR(onefs_sbuf
.st_mode
)) {
741 onefs_sbuf
.st_size
= 0;
743 init_stat_ex_from_onefs_stat(sbuf
, &onefs_sbuf
);
/*
 * enc_fstatat() wrapper that fills a SMB_STRUCT_STAT and reports
 * directories as zero-sized.
 *
 * fname is looked up relative to base_fd with ENC_DEFAULT encoding, and
 * the result is converted with init_stat_ex_from_onefs_stat().
 *
 * NOTE(review): the end of the parameter list (where flags is presumably
 * declared), the declaration of ret, the braces and the return statement
 * are not visible in the text below.
 */
748 int onefs_sys_fstat_at(int base_fd
, const char *fname
, SMB_STRUCT_STAT
*sbuf
,
752 struct stat onefs_sbuf
;
754 ret
= enc_fstatat(base_fd
, fname
, ENC_DEFAULT
, &onefs_sbuf
, flags
);
757 /* we always want directories to appear zero size */
758 if (S_ISDIR(onefs_sbuf
.st_mode
)) {
759 onefs_sbuf
.st_size
= 0;
761 init_stat_ex_from_onefs_stat(sbuf
, &onefs_sbuf
);
766 int onefs_sys_lstat(const char *fname
, SMB_STRUCT_STAT
*sbuf
)
769 struct stat onefs_sbuf
;
771 ret
= lstat(fname
, &onefs_sbuf
);
774 /* we always want directories to appear zero size */
775 if (S_ISDIR(onefs_sbuf
.st_mode
)) {
776 onefs_sbuf
.st_size
= 0;
778 init_stat_ex_from_onefs_stat(sbuf
, &onefs_sbuf
);