akaros/kern/src/ns/sysfile.c
<<
>>
Prefs
   1/* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
   2 * Portions Copyright © 1997-1999 Vita Nuova Limited
   3 * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
   4 *                                (www.vitanuova.com)
   5 * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
   6 *
   7 * Modified for the Akaros operating system:
   8 * Copyright (c) 2013-2014 The Regents of the University of California
   9 * Copyright (c) 2013-2015 Google Inc.
  10 *
  11 * Permission is hereby granted, free of charge, to any person obtaining a copy
  12 * of this software and associated documentation files (the "Software"), to deal
  13 * in the Software without restriction, including without limitation the rights
  14 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  15 * copies of the Software, and to permit persons to whom the Software is
  16 * furnished to do so, subject to the following conditions:
  17 *
  18 * The above copyright notice and this permission notice shall be included in
  19 * all copies or substantial portions of the Software.
  20 *
  21 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  22 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  23 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
  24 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  25 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  26 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  27 * SOFTWARE. */
  28
  29#include <slab.h>
  30#include <kmalloc.h>
  31#include <kref.h>
  32#include <string.h>
  33#include <stdio.h>
  34#include <assert.h>
  35#include <error.h>
  36#include <cpio.h>
  37#include <pmap.h>
  38#include <smp.h>
  39#include <net/ip.h>
  40#include <rcu.h>
  41
  42/* TODO: these sizes are hokey.  DIRSIZE is used in chandirstat, and it looks
  43 * like it's the size of a common-case stat. */
  44enum {
  45        DIRSIZE = STAT_FIX_LEN_AK + 32 * STAT_NR_STRINGS_AK,
  46
  47        /* should handle the largest reasonable directory entry */
  48        DIRREADLIM = 2048,
  49
  50        /* Just read a lot. Memory is cheap, lots of bandwidth, and RPCs are
  51         * very expensive. At the same time, let's not yet exceed a common
  52         * MSIZE. */
  53        DIRREADSIZE = 8192,
  54};
  55
  56int newfd(struct chan *c, int low_fd, int oflags, bool must_use_low)
  57{
  58        int ret = insert_obj_fdt(&current->open_files, c, low_fd,
  59                                 oflags & O_CLOEXEC ? FD_CLOEXEC : 0,
  60                                 must_use_low);
  61        if (ret >= 0)
  62                cclose(c);
  63        return ret;
  64}
  65
  66struct chan *fdtochan(struct fd_table *fdt, int fd, int mode, int chkmnt,
  67                      int iref)
  68{
  69        struct chan *c;
  70
  71        c = lookup_fd(fdt, fd, iref);
  72        if (!c) {
  73                /* We lost the info about why there was a problem (we used to
  74                 * track file group closed too, can add that in later). */
  75                error(EBADF, ERROR_FIXME);
  76        }
  77        if (chkmnt && (c->flag & CMSG)) {
  78                if (iref)
  79                        cclose(c);
  80                error(EBADF, ERROR_FIXME);
  81        }
  82        if (mode < 0)
  83                return c;
  84        if ((mode & c->mode) != mode) {
  85                if (iref)
  86                        cclose(c);
  87                error(EBADF,
  88                      "FD access mode failure: chan mode 0x%x, wanted 0x%x (opened with 0 instead of O_READ?)",
  89                      c->mode, mode);
  90        }
  91        return c;
  92}
  93
  94long kchanio(void *vc, void *buf, int n, int mode)
  95{
  96        ERRSTACK(1);
  97        int r;
  98        struct chan *c;
  99
 100        c = vc;
 101        if (waserror()) {
 102                poperror();
 103                return -1;
 104        }
 105
 106        if (mode == O_READ)
 107                r = devtab[c->type].read(c, buf, n, c->offset);
 108        else if (mode == O_WRITE)
 109                r = devtab[c->type].write(c, buf, n, c->offset);
 110        else
 111                error(ENOSYS, "kchanio: use only O_READ xor O_WRITE");
 112
 113        spin_lock(&c->lock);
 114        c->offset += r;
 115        spin_unlock(&c->lock);
 116        poperror();
 117        return r;
 118}
 119
 120int openmode(uint32_t omode)
 121{
 122/* GIANT WARNING: if this ever throws, ipopen (and probably many others) will
 123 * screw up refcnts of Qctl, err, data, etc */
 124#if 0
 125        /* this is the old plan9 style.  i think they want to turn exec into
 126         * read, and strip off anything higher, and just return the RD/WR style
 127         * bits.  not stuff like ORCLOSE.  the lack of OEXCL might be a bug on
 128         * their part (it's the only one of their non-RW-related flags that
 129         * isn't masked out).
 130         *
 131         * Note that we no longer convert OEXEC/O_EXEC to O_READ, and instead
 132         * return just the O_ACCMODE bits. */
 133        if (o >= (OTRUNC | OCEXEC | ORCLOSE | OEXEC))
 134                error(EINVAL, ERROR_FIXME);
 135        o &= ~(OTRUNC | OCEXEC | ORCLOSE);
 136        if (o > OEXEC)
 137                error(EINVAL, ERROR_FIXME);
 138        if (o == OEXEC)
 139                return OREAD;
 140        return o;
 141#endif
 142        /* no error checking (we have a shitload of flags anyway), and we return
 143         * the basic access modes (RD/WR/ETC) */
 144        return omode & O_ACCMODE;
 145}
 146
 147void fdclose(struct fd_table *fdt, int fd)
 148{
 149        close_fd(fdt, fd);
 150}
 151
 152static void set_dot(struct proc *p, struct chan *c)
 153{
 154        c = atomic_swap_ptr((void**)&p->dot, c);
 155        synchronize_rcu();
 156        cclose(c);
 157}
 158
 159/* Note namec() happens in the namespace of the caller. */
 160int syschdir(struct proc *target, char *path)
 161{
 162        ERRSTACK(1);
 163        struct chan *c;
 164
 165        if (waserror()) {
 166                poperror();
 167                return -1;
 168        }
 169        c = namec(path, Atodir, 0, 0, NULL);
 170        poperror();
 171        set_dot(target, c);
 172        return 0;
 173}
 174
 175/* Note fdtochan() happens with the FDs of the caller. */
 176int sysfchdir(struct proc *target, int fd)
 177{
 178        ERRSTACK(1);
 179        struct chan *c;
 180
 181        if (waserror()) {
 182                poperror();
 183                return -1;
 184        }
 185        c = fdtochan(&current->open_files, fd, -1, 0, 1);
 186        poperror();
 187
 188        /* This is a little hokey.  Ideally, we'd only allow O_PATH fds to be
 189         * fchdir'd.  Linux/POSIX lets you do arbitrary FDs.  Luckily, we stored
 190         * the name when we walked (__namec_from), so we should be able to
 191         * recreate the chan.  Using namec() with channame() is a more
 192         * heavy-weight cclone(), but also might have issues if the chan has
 193         * since been removed or the namespace is otherwise different from when
 194         * the original fd/chan was first created. */
 195        if (c->flag & O_PATH) {
 196                set_dot(target, c);
 197                return 0;
 198        }
 199        if (waserror()) {
 200                cclose(c);
 201                poperror();
 202                return -1;
 203        }
 204        syschdir(target, channame(c));
 205        cclose(c);
 206        poperror();
 207
 208        return 0;
 209}
 210
 211int sysclose(int fd)
 212{
 213        ERRSTACK(1);
 214        struct fd_table *fdt = &current->open_files;
 215
 216        if (waserror()) {
 217                poperror();
 218                return -1;
 219        }
 220        /*
 221         * Take no reference on the chan because we don't really need the
 222         * data structure, and are calling fdtochan only for error checks.
 223         * fdclose takes care of processes racing through here.
 224         */
 225        fdtochan(fdt, fd, -1, 0, 0);
 226        fdclose(fdt, fd);
 227        poperror();
 228        return 0;
 229}
 230
 231int syscreate(char *path, int mode, uint32_t perm)
 232{
 233        ERRSTACK(2);
 234        int fd;
 235        struct chan *c;
 236
 237        if (waserror()) {
 238                poperror();
 239                return -1;
 240        }
 241
 242        openmode(mode & ~O_EXCL);       /* error check only; OEXCL okay here */
 243        c = namec(path, Acreate, mode, perm, NULL);
 244        if (waserror()) {
 245                cclose(c);
 246                nexterror();
 247        }
 248        /* 9ns mode is the O_FLAGS and perm is glibc mode */
 249        fd = newfd(c, 0, mode, FALSE);
 250        if (fd < 0)
 251                error(-fd, ERROR_FIXME);
 252        poperror();
 253
 254        poperror();
 255        return fd;
 256}
 257
 258int sysdup(int old, int low_fd, bool must_use_low)
 259{
 260        ERRSTACK(1);
 261        int fd;
 262        struct chan *c;
 263
 264        if (waserror()) {
 265                poperror();
 266                return -1;
 267        }
 268        c = fdtochan(&current->open_files, old, -1, 0, 1);
 269        if (c->qid.type & QTAUTH) {
 270                cclose(c);
 271                error(EPERM, ERROR_FIXME);
 272        }
 273        fd = newfd(c, low_fd, 0, must_use_low);
 274        if (fd < 0) {
 275                cclose(c);
 276                error(-fd, ERROR_FIXME);
 277        }
 278        poperror();
 279        return fd;
 280}
 281
 282/* Could pass in the fdt instead of the proc, but we used to need the to_proc
 283 * for now so we can claim a VFS FD.  Careful, we don't close the old chan. */
 284int sys_dup_to(struct proc *from_proc, unsigned int from_fd,
 285               struct proc *to_proc, unsigned int to_fd)
 286{
 287        ERRSTACK(1);
 288        int ret;
 289        struct chan *c;
 290
 291        if (waserror()) {
 292                poperror();
 293                return -1;
 294        }
 295        c = fdtochan(&from_proc->open_files, from_fd, -1, 0, 1);
 296        if (c->qid.type & QTAUTH) {
 297                cclose(c);
 298                error(EPERM, ERROR_FIXME);
 299        }
 300        ret = insert_obj_fdt(&to_proc->open_files, c, to_fd, 0, TRUE);
 301        /* drop the ref from fdtochan.  if insert succeeded, there is one other
 302         * ref stored in the FDT */
 303        cclose(c);
 304        if (ret < 0)
 305                error(EFAIL, "Can't insert FD %d into FDG", to_fd);
 306        poperror();
 307        return 0;
 308}
 309
 310char *sysgetcwd(void)
 311{
 312        char *s = NULL;
 313        struct chan *dot;
 314
 315        rcu_read_lock();
 316        dot = rcu_dereference(current->dot);
 317        kref_get(&dot->ref, 1);
 318        rcu_read_unlock();
 319        if (dot->name)
 320                kstrdup(&s, dot->name->s);
 321        cclose(dot);
 322        return s;
 323}
 324
 325int sysfauth(int fd, char *aname)
 326{
 327        ERRSTACK(2);
 328        struct chan *c, *ac;
 329
 330        if (waserror()) {
 331                poperror();
 332                return -1;
 333        }
 334
 335        validname(aname, 0);
 336        c = fdtochan(&current->open_files, fd, O_RDWR, 0, 1);
 337        if (waserror()) {
 338                cclose(c);
 339                nexterror();
 340        }
 341
 342        ac = mntauth(c, aname);
 343
 344        /* at this point ac is responsible for keeping c alive */
 345        poperror();     /* c */
 346        cclose(c);
 347
 348        if (waserror()) {
 349                cclose(ac);
 350                nexterror();
 351        }
 352
 353        fd = newfd(ac, 0, 0, FALSE);
 354        if (fd < 0)
 355                error(-fd, ERROR_FIXME);
 356        poperror();     /* ac */
 357
 358        poperror();
 359
 360        return fd;
 361}
 362
 363int sysfversion(int fd, unsigned int msize, char *vers, unsigned int arglen)
 364{
 365        ERRSTACK(2);
 366        int m;
 367        struct chan *c;
 368
 369        if (waserror()) {
 370                poperror();
 371                return -1;
 372        }
 373
 374        /* check there's a NUL in the version string */
 375        if (arglen == 0 || memchr(vers, 0, arglen) == 0)
 376                error(EINVAL, ERROR_FIXME);
 377
 378        c = fdtochan(&current->open_files, fd, O_RDWR, 0, 1);
 379        if (waserror()) {
 380                cclose(c);
 381                nexterror();
 382        }
 383
 384        m = mntversion(c, vers, msize, arglen);
 385
 386        poperror();
 387        cclose(c);
 388
 389        poperror();
 390        return m;
 391}
 392
 393int sysfwstat(int fd, uint8_t * buf, int n)
 394{
 395        ERRSTACK(2);
 396        struct chan *c;
 397
 398        if (waserror()) {
 399                poperror();
 400                return -1;
 401        }
 402
 403        validstat(buf, n, 0);
 404        c = fdtochan(&current->open_files, fd, -1, 1, 1);
 405        if (waserror()) {
 406                cclose(c);
 407                nexterror();
 408        }
 409        n = devtab[c->type].wstat(c, buf, n);
 410        poperror();
 411        cclose(c);
 412
 413        poperror();
 414        return n;
 415}
 416
 417long bindmount(struct chan *c, char *old, int flag, char *spec)
 418{
 419        ERRSTACK(1);
 420        int ret;
 421        struct chan *c1;
 422
 423        if (flag > MMASK || (flag & MORDER) == (MBEFORE | MAFTER))
 424                error(EINVAL, ERROR_FIXME);
 425
 426        c1 = namec(old, Amount, 0, 0, NULL);
 427        if (waserror()) {
 428                cclose(c1);
 429                nexterror();
 430        }
 431        ret = cmount(c, c1, flag, spec);
 432
 433        poperror();
 434        cclose(c1);
 435        return ret;
 436}
 437
 438int sysbind(char *new, char *old, int flags)
 439{
 440        ERRSTACK(2);
 441        long r;
 442        struct chan *c0;
 443
 444        if (waserror()) {
 445                poperror();
 446                return -1;
 447        }
 448
 449        c0 = namec(new, Abind, 0, 0, NULL);
 450        if (waserror()) {
 451                cclose(c0);
 452                nexterror();
 453        }
 454        r = bindmount(c0, old, flags, "");
 455        poperror();
 456        cclose(c0);
 457
 458        poperror();
 459        return r;
 460}
 461
 462int syssymlink(char *new_path, char *old_path)
 463{
 464        ERRSTACK(1);
 465        struct chan *c;
 466
 467        if (waserror()) {
 468                poperror();
 469                return -1;
 470        }
 471        validname(old_path, true);
 472        c = namec(new_path, Acreate, O_EXCL,
 473                  DMSYMLINK | S_IRWXU | S_IRWXG | S_IRWXO, old_path);
 474        cclose(c);
 475        poperror();
 476        return 0;
 477}
 478
 479int sysmount(int fd, int afd, char *old, int flags, char *spec)
 480{
 481        ERRSTACK(1);
 482        long r;
 483        volatile struct {
 484                struct chan *c;
 485        } c0;
 486        volatile struct {
 487                struct chan *c;
 488        } bc;
 489        volatile struct {
 490                struct chan *c;
 491        } ac;
 492        struct mntparam mntparam;
 493
 494        ac.c = NULL;
 495        bc.c = NULL;
 496        c0.c = NULL;
 497        if (waserror()) {
 498                cclose(ac.c);
 499                cclose(bc.c);
 500                cclose(c0.c);
 501                poperror();
 502                return -1;
 503        }
 504        bc.c = fdtochan(&current->open_files, fd, O_RDWR, 0, 1);
 505        if (afd >= 0)
 506                ac.c = fdtochan(&current->open_files, afd, O_RDWR, 0, 1);
 507        mntparam.chan = bc.c;
 508        mntparam.authchan = ac.c;
 509        mntparam.spec = spec;
 510        c0.c = devtab[devno("mnt", 0)].attach((char *)&mntparam);
 511        if (flags & MCACHE)
 512                c0.c = devtab[devno("gtfs", 0)].attach((char*)c0.c);
 513        r = bindmount(c0.c, old, flags, spec);
 514        poperror();
 515        cclose(ac.c);
 516        cclose(bc.c);
 517        cclose(c0.c);
 518
 519        return r;
 520}
 521
 522int sysunmount(char *src_path, char *onto_path)
 523{
 524        ERRSTACK(1);
 525        volatile struct {
 526                struct chan *c;
 527        } cmount;
 528        volatile struct {
 529                struct chan *c;
 530        } cmounted;
 531
 532        cmount.c = NULL;
 533        cmounted.c = NULL;
 534        if (waserror()) {
 535                cclose(cmount.c);
 536                cclose(cmounted.c);
 537                poperror();
 538                return -1;
 539        }
 540
 541        cmount.c = namec(onto_path, Amount, 0, 0, NULL);
 542        if (src_path != NULL && src_path[0] != '\0') {
 543                /*
 544                 * This has to be namec(..., Aopen, ...) because
 545                 * if arg[0] is something like /srv/cs or /fd/0,
 546                 * opening it is the only way to get at the real
 547                 * Chan underneath.
 548                 */
 549                cmounted.c = namec(src_path, Aopen, O_READ, 0, NULL);
 550        }
 551
 552        cunmount(cmount.c, cmounted.c);
 553        poperror();
 554        cclose(cmount.c);
 555        cclose(cmounted.c);
 556        return 0;
 557}
 558
 559int sysopenat(int fromfd, char *path, int vfs_flags, int perm)
 560{
 561        ERRSTACK(1);
 562        int fd;
 563        struct chan *c = NULL, *from;
 564        int open_or_create = Aopen;
 565
 566        /* O_EXCL must be O_CREATE (checked in syscall.c); we can skip the Aopen
 567         * call.  Note namec(Acreate) checks O_EXCL internally. */
 568        if (vfs_flags & O_EXCL)
 569                open_or_create = Acreate;
 570        if (waserror()) {
 571                if (open_or_create == Aopen && vfs_flags & O_CREATE
 572                    && get_errno() == ENOENT) {
 573                        open_or_create = Acreate;
 574                        /* Don't poperror - we're keeping ourselves at the
 575                         * current waserror() depth.  Returns thrice! */
 576                        goto retry;
 577                }
 578                cclose(c);
 579                poperror();
 580                return -1;
 581        }
 582retry:
 583        openmode(vfs_flags & ~O_EXCL);  /* error check only; O_EXCL okay here */
 584        if ((path[0] == '/') || (fromfd == AT_FDCWD)) {
 585                c = namec(path, open_or_create, vfs_flags, perm, NULL);
 586        } else {
 587                /* We don't cclose from.  namec_from will convert it to the new
 588                 * chan during the walk process (c).  It'll probably close from
 589                 * internally, and give us something new for c.  On error,
 590                 * namec_from will cclose from. */
 591                from = fdtochan(&current->open_files, fromfd, -1, FALSE, TRUE);
 592                if (!(from->flag & O_PATH)) {
 593                        /* This is the only error path where we need to close
 594                         * from.  namec_from will close from for us, regardless
 595                         * of whether or not it fails. */
 596                        cclose(from);
 597                        error(EINVAL, "Cannot openat from a non-O_PATH FD");
 598                }
 599                c = namec_from(from, path, open_or_create, vfs_flags, perm,
 600                               NULL);
 601        }
 602        /* Devices should catch this, but just in case, we'll catch it. */
 603        if ((c->qid.type & QTSYMLINK) && (vfs_flags & O_NOFOLLOW))
 604                error(ELOOP, "no-follow open of a symlink");
 605        fd = newfd(c, 0, vfs_flags, FALSE);
 606        if (fd < 0)
 607                error(-fd, ERROR_FIXME);
 608        poperror();
 609        return fd;
 610}
 611
 612int sysopen(char *path, int vfs_flags)
 613{
 614        return sysopenat(AT_FDCWD, path, vfs_flags, 0);
 615}
 616
 617long unionread(struct chan *c, void *va, long n)
 618{
 619        ERRSTACK(1);
 620        int i;
 621        long nr;
 622        struct mhead *m;
 623        struct mount *mount;
 624
 625        qlock(&c->umqlock);
 626        m = c->umh;
 627        rlock(&m->lock);
 628        mount = m->mount;
 629        /* bring mount in sync with c->uri and c->umc */
 630        for (i = 0; mount != NULL && i < c->uri; i++)
 631                mount = mount->next;
 632
 633        nr = 0;
 634        while (mount != NULL) {
 635                /* Error causes component of union to be skipped */
 636                if (mount->to) {
 637                        /* normally we want to discard the error, but for our
 638                         * ghetto kdirent hack, we need to repeat unionread if
 639                         * we saw a ENODATA */
 640                        if (waserror()) {
 641                                if (get_errno() == ENODATA) {
 642                                        runlock(&m->lock);
 643                                        qunlock(&c->umqlock);
 644                                        nexterror();
 645                                }
 646                                /* poperror done below for either branch */
 647                        } else {
 648                                if (c->umc == NULL) {
 649                                        c->umc = cclone(mount->to);
 650                                        c->umc =
 651                                            devtab[c->umc->type].open(c->umc,
 652                                                                      O_READ);
 653                                }
 654
 655                                nr = devtab[c->umc->type].read(c->umc, va, n,
 656                                                               c->umc->offset);
 657                                if (nr < 0)
 658                                        nr = 0; /* dev.c can return -1 */
 659                                c->umc->offset += nr;
 660                        }
 661                        poperror();     /* pop regardless */
 662                }
 663                if (nr > 0)
 664                        break;
 665
 666                /* Advance to next element */
 667                c->uri++;
 668                if (c->umc) {
 669                        cclose(c->umc);
 670                        c->umc = NULL;
 671                }
 672                mount = mount->next;
 673        }
 674        runlock(&m->lock);
 675        qunlock(&c->umqlock);
 676        return nr;
 677}
 678
 679static void unionrewind(struct chan *c)
 680{
 681        qlock(&c->umqlock);
 682        c->uri = 0;
 683        if (c->umc) {
 684                cclose(c->umc);
 685                c->umc = NULL;
 686        }
 687        qunlock(&c->umqlock);
 688}
 689
 690static long rread(int fd, void *va, long n, int64_t * offp)
 691{
 692        ERRSTACK(3);
 693        int dir;
 694        struct chan *c;
 695        int64_t off;
 696
 697        /* dirty dirent hack */
 698        void *real_va = va;
 699
 700        if (waserror()) {
 701                poperror();
 702                return -1;
 703        }
 704
 705        c = fdtochan(&current->open_files, fd, O_READ, 1, 1);
 706        if (waserror()) {
 707                cclose(c);
 708                nexterror();
 709        }
 710
 711        if (n < 0)
 712                error(EINVAL, ERROR_FIXME);
 713
 714        dir = c->qid.type & QTDIR;
 715
 716        /* kdirent hack: userspace is expecting kdirents, but all of 9ns
 717         * produces Ms.  Just save up what we don't use and append the
 718         * new stuff later. Allocate DIRREADSIZE bytes for that purpose.
 719         */
 720        if (dir) {
 721                int amt;
 722
 723                if (n < sizeof(struct kdirent))
 724                        error(EINVAL, "readdir needs to read at least %d",
 725                              sizeof(struct kdirent));
 726                if (!c->buf) {
 727                        c->buf = kmalloc(DIRREADSIZE, MEM_WAIT);
 728                        c->bufused = 0;
 729                }
 730                /* Attempt to extract an M, in case there was some already */
 731                amt = convM2kdirent(c->buf, c->bufused, real_va, 0);
 732                if (amt) {
 733                        c->bufused -= amt;
 734                        memmove(c->buf, c->buf + amt, c->bufused);
 735                        n = sizeof(struct kdirent);
 736                        goto out;
 737                }
 738                /* debugging */
 739                if (waserror()) {
 740                        printk("Well, sysread of a dir sucks.%s \n",
 741                               current_errstr());
 742                        nexterror();
 743                }
 744                va = c->buf + c->bufused;
 745                n = DIRREADSIZE - c->bufused;
 746        }
 747
 748        /* this is the normal plan9 read */
 749        if (dir && c->umh)
 750                n = unionread(c, va, n);
 751        else {
 752                if (offp == NULL) {
 753                        spin_lock(&c->lock); /* lock for int64_t assignment */
 754                        off = c->offset;
 755                        spin_unlock(&c->lock);
 756                } else
 757                        off = *offp;
 758                if (off < 0)
 759                        error(EINVAL, ERROR_FIXME);
 760                if ((off64_t)off + (size_t)n < (off64_t)off)
 761                        error(EINVAL, "bad offset %p + count %p", off, n);
 762                if (off == 0) {
 763                        if (offp == NULL) {
 764                                spin_lock(&c->lock);
 765                                c->offset = 0;
 766                                c->dri = 0;
 767                                spin_unlock(&c->lock);
 768                        }
 769                        unionrewind(c);
 770                }
 771                if (! c->ateof) {
 772                        n = devtab[c->type].read(c, va, n, off);
 773                        if (n == 0 && dir)
 774                                c->ateof = 1;
 775                } else {
 776                        n = 0;
 777                }
 778                spin_lock(&c->lock);
 779                c->offset += n;
 780                spin_unlock(&c->lock);
 781        }
 782
 783        /* dirty kdirent hack */
 784        if (dir) {
 785                int amt;
 786                c->bufused = c->bufused + n;
 787                /* extract an M from the front, then shift the remainder back */
 788                amt = convM2kdirent(c->buf, c->bufused, real_va, 0);
 789                c->bufused -= amt;
 790                memmove(c->buf, c->buf + amt, c->bufused);
 791                n = amt ? sizeof(struct kdirent) : 0;
 792                poperror();     /* matching our debugging waserror */
 793        }
 794
 795out:
 796        poperror();
 797        cclose(c);
 798
 799        poperror();
 800        return n;
 801}
 802
 803/* Reads exactly n bytes from chan c, starting at its offset.  Can block, but if
 804 * we get 0 back too soon (EOF or error), then we'll error out with ENODATA.
 805 * That might need a little work - if there was a previous error, then we
 806 * clobbered it and only know ENODATA but not why we completed early. */
 807void read_exactly_n(struct chan *c, void *vp, long n)
 808{
 809        char *p;
 810        long nn;
 811        int total = 0, want = n;
 812
 813        p = vp;
 814        while (n > 0) {
 815                nn = devtab[c->type].read(c, p, n, c->offset);
 816                printd("readn: Got %d@%lld\n", nn, c->offset);
 817                if (nn == 0)
 818                        error(ENODATA, "wanted %d, got %d", want, total);
 819                spin_lock(&c->lock);
 820                c->offset += nn;
 821                spin_unlock(&c->lock);
 822                p += nn;
 823                n -= nn;
 824                total += nn;
 825        }
 826}
 827
 828long sysread(int fd, void *va, long n)
 829{
 830        return rread(fd, va, n, NULL);
 831}
 832
 833long syspread(int fd, void *va, long n, int64_t off)
 834{
 835        return rread(fd, va, n, &off);
 836}
 837
 838int sysremove(char *path)
 839{
 840        ERRSTACK(2);
 841        struct chan *c;
 842
 843        if (waserror()) {
 844                poperror();
 845                return -1;
 846        }
 847
 848        c = namec(path, Aremove, 0, 0, NULL);
 849        if (waserror()) {
 850                c->type = -1;   /* see below */
 851                cclose(c);
 852                nexterror();
 853        }
 854        devtab[c->type].remove(c);
 855        /*
 856         * Remove clunks the fid, but we need to recover the Chan
 857         * so fake it up.  -1 aborts the dev's close.
 858         */
 859        c->type = -1;
 860        poperror();
 861        cclose(c);
 862
 863        poperror();
 864        return 0;
 865}
 866
 867int sysrename(char *from_path, char *to_path)
 868{
 869        ERRSTACK(1);
 870        struct chan *volatile renamee = NULL;
 871        struct chan *parent_chan;
 872
 873        if (waserror()) {
 874                cclose(renamee);
 875                poperror();
 876                return -1;
 877        }
 878        renamee = namec(from_path, Aremove, 0, 0, NULL);
 879        /* We might need to support wstat for 'short' rename (intra-directory,
 880         * with no slashes).  Til then, we can just go with EXDEV. */
 881        if (!devtab[renamee->type].rename)
 882                error(EXDEV, "device does not support rename");
 883        parent_chan = namec(to_path, Arename, 0, 0, (char*)renamee);
 884        /* When we're done, renamee still points to the file, but it's in the
 885         * new location.  Its cname is still the old location, similar to
 886         * remove.  If anyone cares, we can change it.  parent_chan still points
 887         * to the parent - it didn't get moved like create does.  Though it does
 888         * have the name of the new location.  If we want, we can hand that to
 889         * renamee.  It's a moot point, since they are both getting closed. */
 890        cclose(renamee);
 891        cclose(parent_chan);
 892        poperror();
 893        return 0;
 894}
 895
 896int64_t sysseek(int fd, int64_t off, int whence)
 897{
 898        ERRSTACK(2);
 899        struct dir *dir;
 900        struct chan *c;
 901
 902        if (waserror()) {
 903                poperror();
 904                return -1;
 905        }
 906
 907        c = fdtochan(&current->open_files, fd, -1, 1, 1);
 908        if (waserror()) {
 909                cclose(c);
 910                nexterror();
 911        }
 912        switch (whence) {
 913        case 0:
 914                if (c->qid.type & QTDIR) {
 915                        if (off != 0)
 916                                error(EISDIR, ERROR_FIXME);
 917                        unionrewind(c);
 918                } else if (off < 0)
 919                        error(EINVAL, ERROR_FIXME);
 920                spin_lock(&c->lock);    /* lock for int64_t assignment */
 921                c->offset = off;
 922                spin_unlock(&c->lock);
 923                break;
 924
 925        case 1:
 926                if (c->qid.type & QTDIR)
 927                        error(EISDIR, ERROR_FIXME);
 928                spin_lock(&c->lock);    /* lock for read/write update */
 929                off += c->offset;
 930                if (off < 0) {
 931                        spin_unlock(&c->lock);
 932                        error(EINVAL, ERROR_FIXME);
 933                }
 934                c->offset = off;
 935                spin_unlock(&c->lock);
 936                break;
 937
 938        case 2:
 939                if (c->qid.type & QTDIR)
 940                        error(EISDIR, ERROR_FIXME);
 941                dir = chandirstat(c);
 942                if (dir == NULL)
 943                        error(EFAIL, "internal error: stat error in seek");
 944                off += dir->length;
 945                kfree(dir);
 946                if (off < 0)
 947                        error(EINVAL, ERROR_FIXME);
 948                spin_lock(&c->lock);    /* lock for read/write update */
 949                c->offset = off;
 950                spin_unlock(&c->lock);
 951                break;
 952
 953        default:
 954                error(EINVAL, ERROR_FIXME);
 955                break;
 956        }
 957        poperror();
 958        c->dri = 0;
 959        cclose(c);
 960        poperror();
 961        return off;
 962}
 963
 964void validstat(uint8_t * s, int n, int slashok)
 965{
 966
 967        int m;
 968        char buf[64];
 969
 970        statcheck(s, n);
 971        /* verify that name entry is acceptable */
 972        s += STAT_FIX_LEN_9P - STAT_NR_STRINGS_9P * BIT16SZ;
 973        /*
 974         * s now points at count for first string.
 975         * if it's too long, let the server decide; this is
 976         * only for his protection anyway. otherwise
 977         * we'd have to allocate and waserror.
 978         */
 979        m = GBIT16(s);
 980        s += BIT16SZ;
 981        if (m + 1 > sizeof buf) {
 982                return;
 983        }
 984        memmove(buf, s, m);
 985        buf[m] = '\0';
 986        /* name could be '/' */
 987        if (strcmp(buf, "/") != 0)
 988                validname(buf, slashok);
 989}
 990
 991int sysfstat(int fd, uint8_t *buf, int n)
 992{
 993        ERRSTACK(2);
 994        struct chan *c;
 995
 996        if (waserror()) {
 997                poperror();
 998                return -1;
 999        }
1000
1001        c = fdtochan(&current->open_files, fd, -1, 0, 1);
1002        if (waserror()) {
1003                cclose(c);
1004                nexterror();
1005        }
1006        devtab[c->type].stat(c, buf, n);
1007
1008        poperror();
1009        cclose(c);
1010
1011        poperror();
1012        return n;
1013}
1014
1015int sysfstatakaros(int fd, struct kstat *ks)
1016{
1017
1018        int n = 4096;
1019        uint8_t *buf;
1020
1021        buf = kmalloc(n, MEM_WAIT);
1022        n = sysfstat(fd, buf, n);
1023        if (n > 0) {
1024                convM2kstat(buf, n, ks);
1025                n = 0;
1026        }
1027        kfree(buf);
1028        return n;
1029}
1030
1031static int __stat(char *path, uint8_t *buf, int n, int flags)
1032{
1033        ERRSTACK(2);
1034        struct chan *c;
1035
1036        if (waserror()) {
1037                poperror();
1038                return -1;
1039        }
1040
1041        c = namec(path, Aaccess, flags, 0, NULL);
1042        if (waserror()) {
1043                cclose(c);
1044                nexterror();
1045        }
1046        devtab[c->type].stat(c, buf, n);
1047        poperror();
1048        cclose(c);
1049
1050        poperror();
1051
1052        return n;
1053}
1054
/* Stats 'path' into 'buf' ('n' bytes of room), with no lookup flags.  Return
 * value is that of __stat(): 'n' on success, -1 on error. */
int sysstat(char *path, uint8_t *buf, int n)
{
	const int lookup_flags = 0;

	return __stat(path, buf, n, lookup_flags);
}
1059
1060int syslstat(char *path, uint8_t *buf, int n)
1061{
1062        return __stat(path, buf, n, O_NOFOLLOW);
1063}
1064
1065int sysstatakaros(char *path, struct kstat *ks, int flags)
1066{
1067
1068        int n = 4096;
1069        uint8_t *buf;
1070
1071        buf = kmalloc(n, MEM_WAIT);
1072        n = __stat(path, buf, n, flags);
1073        if (n > 0) {
1074                convM2kstat(buf, n, ks);
1075                n = 0;
1076        }
1077        kfree(buf);
1078        return n;
1079}
1080
1081static long rwrite(int fd, void *va, long n, int64_t * offp)
1082{
1083        ERRSTACK(3);
1084        struct chan *c;
1085        struct dir *dir;
1086        int64_t off;
1087        long m;
1088
1089        if (waserror()) {
1090                poperror();
1091                return -1;
1092        }
1093        c = fdtochan(&current->open_files, fd, O_WRITE, 1, 1);
1094        if (waserror()) {
1095                cclose(c);
1096                nexterror();
1097        }
1098        if (c->qid.type & QTDIR)
1099                error(EISDIR, ERROR_FIXME);
1100
1101        if (n < 0)
1102                error(EINVAL, ERROR_FIXME);
1103
1104        if (offp == NULL) {
1105                /* append changes the offset to the end, and even if we fail
1106                 * later, this change will persist */
1107                if (c->flag & O_APPEND) {
1108                        dir = chandirstat(c);
1109                        if (!dir)
1110                                error(EFAIL, "stat error in append write");
1111                        /* legacy lock for int64 assignment */
1112                        spin_lock(&c->lock);
1113                        c->offset = dir->length;
1114                        spin_unlock(&c->lock);
1115                        kfree(dir);
1116                }
1117                spin_lock(&c->lock);
1118                off = c->offset;
1119                c->offset += n;
1120                spin_unlock(&c->lock);
1121        } else
1122                off = *offp;
1123
1124        if (waserror()) {
1125                if (offp == NULL) {
1126                        spin_lock(&c->lock);
1127                        c->offset -= n;
1128                        spin_unlock(&c->lock);
1129                }
1130                nexterror();
1131        }
1132        if (off < 0)
1133                error(EINVAL, ERROR_FIXME);
1134        if ((off64_t)off + (size_t)n < (off64_t)off)
1135                error(EINVAL, "bad offset %p + count %p", off, n);
1136        m = devtab[c->type].write(c, va, n, off);
1137        poperror();
1138
1139        if (offp == NULL && m < n) {
1140                spin_lock(&c->lock);
1141                c->offset -= n - m;
1142                spin_unlock(&c->lock);
1143        }
1144
1145        poperror();
1146        cclose(c);
1147
1148        poperror();
1149        return m;
1150}
1151
/* Writes 'n' bytes from 'va' to 'fd' at the chan's current offset, advancing
 * that offset (rwrite() with no explicit offset).  Returns rwrite()'s result:
 * the amount written, or -1 on error. */
long syswrite(int fd, void *va, long n)
{
	int64_t *use_chan_offset = NULL;

	return rwrite(fd, va, n, use_chan_offset);
}
1156
/* Writes 'n' bytes from 'va' to 'fd' at the explicit offset 'off'; the chan's
 * own offset is not touched (rwrite() with an explicit offset).  Returns
 * rwrite()'s result: the amount written, or -1 on error. */
long syspwrite(int fd, void *va, long n, int64_t off)
{
	int64_t *explicit_off = &off;

	return rwrite(fd, va, n, explicit_off);
}
1161
1162int syswstat(char *path, uint8_t * buf, int n)
1163{
1164        ERRSTACK(2);
1165        struct chan *c;
1166
1167        if (waserror()) {
1168                poperror();
1169                return -1;
1170        }
1171
1172        validstat(buf, n, 0);
1173        c = namec(path, Aaccess, 0, 0, NULL);
1174        if (waserror()) {
1175                cclose(c);
1176                nexterror();
1177        }
1178        n = devtab[c->type].wstat(c, buf, n);
1179        poperror();
1180        cclose(c);
1181
1182        poperror();
1183        return n;
1184}
1185
1186struct dir *chandirstat(struct chan *c)
1187{
1188        ERRSTACK(1);
1189        struct dir *d;
1190        uint8_t *buf;
1191        int n, nd, i;
1192
1193        nd = DIRSIZE;
1194        for (i = 0; i < 2; i++) {       /* should work by the second try */
1195                d = kzmalloc(sizeof(struct dir) + nd, MEM_WAIT);
1196                buf = (uint8_t *) & d[1];
1197                if (waserror()) {
1198                        kfree(d);
1199                        poperror();
1200                        return NULL;
1201                }
1202                n = devtab[c->type].stat(c, buf, nd);
1203                poperror();
1204                if (n < BIT16SZ) {
1205                        kfree(d);
1206                        return NULL;
1207                }
1208                /* size needed to store whole stat buffer including count */
1209                nd = GBIT16((uint8_t *) buf) + BIT16SZ;
1210                if (nd <= n) {
1211                        convM2D(buf, n, d, (char *)&d[1]);
1212                        return d;
1213                }
1214                /* else sizeof(Dir)+nd is plenty */
1215                kfree(d);
1216        }
1217        return NULL;
1218
1219}
1220
1221static struct dir *__dir_stat(char *name, int flags)
1222{
1223        ERRSTACK(2);
1224        struct chan *c;
1225        struct dir *d;
1226
1227        if (waserror()) {
1228                poperror();
1229                return NULL;
1230        }
1231
1232        c = namec(name, Aaccess, flags, 0, NULL);
1233        if (waserror()) {
1234                cclose(c);
1235                nexterror();
1236        }
1237        d = chandirstat(c);
1238        poperror();
1239        cclose(c);
1240
1241        poperror();
1242        return d;
1243}
1244
/* Stats the file at 'name', with no lookup flags.  Returns a struct dir from
 * __dir_stat() that the caller kfree()s, or NULL on failure. */
struct dir *sysdirstat(char *name)
{
	const int lookup_flags = 0;

	return __dir_stat(name, lookup_flags);
}
1249
1250struct dir *sysdirlstat(char *name)
1251{
1252        return __dir_stat(name, O_NOFOLLOW);
1253}
1254
1255struct dir *sysdirfstat(int fd)
1256{
1257        ERRSTACK(2);
1258        struct chan *c;
1259        struct dir *d;
1260
1261        if (waserror()) {
1262                poperror();
1263                return NULL;
1264        }
1265
1266        c = fdtochan(&current->open_files, fd, -1, 0, 1);
1267        if (waserror()) {
1268                cclose(c);
1269                nexterror();
1270        }
1271        d = chandirstat(c);
1272        poperror();
1273        cclose(c);
1274
1275        poperror();
1276        return d;
1277}
1278
1279int sysdirwstat(char *name, struct dir *dir)
1280{
1281
1282        uint8_t *buf;
1283        int r;
1284
1285        r = sizeD2M(dir);
1286        buf = kzmalloc(r, MEM_WAIT);
1287        convD2M(dir, buf, r);
1288        r = syswstat(name, buf, r);
1289        kfree(buf);
1290        return r < 0 ? r : 0;
1291}
1292
1293int sysdirfwstat(int fd, struct dir *dir)
1294{
1295
1296        uint8_t *buf;
1297        int r;
1298
1299        r = sizeD2M(dir);
1300        buf = kzmalloc(r, MEM_WAIT);
1301        convD2M(dir, buf, r);
1302        r = sysfwstat(fd, buf, r);
1303        kfree(buf);
1304        return r < 0 ? r : 0;
1305}
1306
1307static long dirpackage(uint8_t * buf, long ts, struct kdirent **d)
1308{
1309
1310        char *s;
1311        long ss, i, n, nn, m = 0;
1312
1313        *d = NULL;
1314        if (ts <= 0) {
1315                return ts;
1316        }
1317
1318        /*
1319         * first find number of all stats, check they look like stats, & size
1320         * all associated strings
1321         */
1322        ss = 0;
1323        n = 0;
1324        for (i = 0; i < ts; i += m) {
1325                m = BIT16SZ + GBIT16(&buf[i]);
1326                statcheck(&buf[i], m);
1327                ss += m;
1328                n++;
1329        }
1330
1331        *d = kzmalloc(n * sizeof(**d) + ss, 0);
1332        if (*d == NULL)
1333                error(ENOMEM, ERROR_FIXME);
1334
1335        /*
1336         * then convert all buffers
1337         */
1338        s = (char *)*d + n * sizeof(**d);
1339        nn = 0;
1340        for (i = 0; i < ts; i += m) {
1341                m = BIT16SZ + GBIT16((uint8_t *) & buf[i]);
1342                /* Note 's' is ignored by convM2kdirent */
1343                if (nn >= n || /*convM2D */ convM2kdirent(&buf[i], m, *d + nn,
1344                                                          s) != m) {
1345                        kfree(*d);
1346                        *d = NULL;
1347                        error(EFAIL, "bad directory entry");
1348                }
1349                nn++;
1350                s += m;
1351        }
1352
1353        return nn;
1354}
1355
1356long sysdirread(int fd, struct kdirent **d)
1357{
1358        ERRSTACK(2);
1359        uint8_t *buf;
1360        long ts;
1361
1362        *d = NULL;
1363        if (waserror()) {
1364                poperror();
1365                return -1;
1366        }
1367        buf = kzmalloc(DIRREADLIM, 0);
1368        if (buf == NULL)
1369                error(ENOMEM, ERROR_FIXME);
1370        if (waserror()) {
1371                kfree(buf);
1372                nexterror();
1373        }
1374        ts = sysread(fd, buf, DIRREADLIM);
1375        if (ts >= 0)
1376                ts = dirpackage(buf, ts, d);
1377        poperror();
1378        kfree(buf);
1379        poperror();
1380        return ts;
1381}
1382
1383int sysiounit(int fd)
1384{
1385        ERRSTACK(1);
1386        struct chan *c;
1387        int n;
1388
1389        c = fdtochan(&current->open_files, fd, -1, 0, 1);
1390        if (waserror()) {
1391                cclose(c);
1392                poperror();
1393                return 0;       /* n.b. */
1394        }
1395        n = c->iounit;
1396        poperror();
1397        cclose(c);
1398        return n;
1399}
1400
1401void print_chaninfo(struct chan *c)
1402{
1403
1404        char buf[128] = { 0 };
1405        bool has_dev = c->type != -1;
1406        bool has_chaninfo = has_dev && devtab[c->type].chaninfo;
1407
1408        print_lock();
1409        printk("Chan flags: %p, pathname: %s, ref: %d, Dev: %s, Devinfo: %s",
1410                   c->flag,
1411                   c->name ? c->name->s : "no cname",
1412                   kref_refcnt(&c->ref),
1413                   has_dev ? devtab[c->type].name : "no dev",
1414                   has_chaninfo ? devtab[c->type].chaninfo(c, buf, sizeof(buf))
1415                                : "");
1416        if (!has_chaninfo)
1417                printk("qid.path: %p\n", c->qid.path);
1418        printk("\n");
1419        print_unlock();
1420}
1421
1422/* TODO: 9ns ns inheritance flags: Shared, copied, or empty.  The old fgrp is
1423 * managed by the fd_table, which is handled outside this function.  We share
1424 * the pgrp. */
1425int plan9setup(struct proc *new_proc, struct proc *parent, int flags)
1426{
1427
1428        struct chan *new_dot;
1429
1430        ERRSTACK(1);
1431        if (waserror()) {
1432                printk("plan9setup failed, %s\n", current_errstr());
1433                poperror();
1434                return -1;
1435        }
1436        if (!parent) {
1437                /* We are probably spawned by the kernel directly, and have no
1438                 * parent to inherit from. */
1439                new_proc->pgrp = newpgrp();
1440                new_proc->slash = namec("#kfs", Atodir, 0, 0, NULL);
1441                if (!new_proc->slash)
1442                        panic("no kfs device");
1443                /* Want the name to be "/" instead of "#kfs" */
1444                cnameclose(new_proc->slash->name);
1445                new_proc->slash->name = newcname("/");
1446                new_proc->dot = cclone(new_proc->slash);
1447                poperror();
1448                return 0;
1449        }
1450        /* Shared semantics */
1451        kref_get(&parent->pgrp->ref, 1);
1452        new_proc->pgrp = parent->pgrp;
1453        /* copy semantics on / and . (doesn't make a lot of sense in akaros
1454         * o/w). */
1455        /* / should never disappear while we hold a ref to parent */
1456        chan_incref(parent->slash);
1457        new_proc->slash = parent->slash;
1458
1459        rcu_read_lock();
1460        new_dot = rcu_dereference(parent->dot);
1461        kref_get(&new_dot->ref, 1);
1462        rcu_read_unlock();
1463        new_proc->dot = new_dot;
1464
1465        poperror();
1466        return 0;
1467}
1468
1469/* Open flags, create modes, access types, file flags, and all that...
1470 *
1471 * there are a bunch of things here:
1472 *              1) file creation flags (e.g. O_TRUNC)
1473 *              2) file status flags (e.g. O_APPEND)
1474 *              3) file open modes (e.g. O_RDWR)
1475 *              4) file descriptor flags (e.g. CLOEXEC)
1476 *              5) file creation mode (e.g. S_IRWXU)
1477 * the 1-4 are passed in via open's vfs_flags, and the 5 via mode only when
1478 * O_CREATE is set.
1479 *
1480 * file creation flags (1) only matter when creating, but aren't permanent.
1481 * O_EXCL, O_DIRECTORY, O_TRUNC, etc.
1482 *
1483 * file status flags (2) are per struct file/chan.  stuff like O_APPEND,
1484 * O_ASYNC, etc.  we convert those to an internal flag bit and store in c->flags
1485 *
1486 * the open mode (3) matters for a given FD/chan (chan->mode), and should be
1487 * stored in the chan. (c->mode) stuff like O_RDONLY.
1488 *
1489 * the file descriptor flags (4) clearly are in the FD.  note that the same
1490 * file/chan can be opened by two different FDs, with different flags.  the only
1491 * one anyone uses is CLOEXEC.  while exec may not last long in akaros, i can
1492 * imagine similar "never pass to children" flags/meanings.
1493 *
1494 * the file creation mode (5) matters for the device's permissions; given this,
1495 * it should be stored in the device/inode.  ACLs fall under this category.
1496 *
1497 * finally, only certain categories can be edited afterwards: file status flags
1498 * (2), FD flags (4), and file permissions (5). */
1499int fd_getfl(int fd)
1500{
1501        ERRSTACK(1);
1502        struct chan *c;
1503        int ret;
1504
1505        if (waserror()) {
1506                poperror();
1507                return -1;
1508        }
1509        c = fdtochan(&current->open_files, fd, -1, 0, 1);
1510
1511        ret = c->mode;
1512        ret |= c->flag & CEXTERNAL_FLAGS;
1513
1514        cclose(c);
1515        poperror();
1516        return ret;
1517}
1518
1519static bool cexternal_flags_differ(int set1, int set2, int flags)
1520{
1521        flags &= CEXTERNAL_FLAGS;
1522        return (set1 & flags) ^ (set2 & flags);
1523}
1524
1525static int chan_setfl(struct chan *c, int flags)
1526{
1527        int ret;
1528
1529        if (cexternal_flags_differ(flags, c->flag, O_CLOEXEC)) {
1530                /* TODO: The whole CCEXEC / O_CLOEXEC on 9ns needs work */
1531                error(EINVAL, "can't toggle O_CLOEXEC with setfl");
1532        }
1533        if (cexternal_flags_differ(flags, c->flag, O_REMCLO))
1534                error(EINVAL, "can't toggle O_REMCLO with setfl");
1535        if (cexternal_flags_differ(flags, c->flag, O_PATH))
1536                error(EINVAL, "can't toggle O_PATH with setfl");
1537        ret = devtab[c->type].chan_ctl(c, CCTL_SET_FL, flags & CEXTERNAL_FLAGS,
1538                                       0, 0, 0);
1539        c->flag = (c->flag & ~CEXTERNAL_FLAGS) | (flags & CEXTERNAL_FLAGS);
1540        return ret;
1541}
1542
1543int fd_chan_ctl(int fd, int cmd, unsigned long arg1, unsigned long arg2,
1544                unsigned long arg3, unsigned long arg4)
1545{
1546        ERRSTACK(2);
1547        struct chan *c;
1548        int ret;
1549
1550        if (waserror()) {
1551                poperror();
1552                return -1;
1553        }
1554        c = fdtochan(&current->open_files, fd, -1, 0, 1);
1555        if (waserror()) {
1556                cclose(c);
1557                nexterror();
1558        }
1559
1560        if (!devtab[c->type].chan_ctl)
1561                error(EINVAL, "%s has no chan_ctl, can't %d", chan_dev_name(c),
1562                      cmd);
1563
1564        /* Some commands require 9ns support in addition to the device ctl. */
1565        switch (cmd) {
1566        case CCTL_SET_FL:
1567                ret = chan_setfl(c, arg1);
1568                break;
1569        default:
1570                ret = devtab[c->type].chan_ctl(c, cmd, arg1, arg2, arg3, arg4);
1571                break;
1572        }
1573
1574        poperror();
1575        cclose(c);
1576        poperror();
1577        return ret;
1578}
1579
1580ssize_t kread_file(struct file_or_chan *file, void *buf, size_t sz)
1581{
1582        /* TODO: (KFOP) (VFS kernel read/writes need to be from a ktask) */
1583        uintptr_t old_ret = switch_to_ktask();
1584        off64_t dummy = 0;
1585        ssize_t cpy_amt = foc_read(file, buf, sz, dummy);
1586
1587        switch_back_from_ktask(old_ret);
1588        return cpy_amt;
1589}
1590
1591/* Reads the contents of an entire file into a buffer, returning that buffer.
1592 * On error, prints something useful and returns 0 */
1593void *kread_whole_file(struct file_or_chan *file)
1594{
1595        size_t size;
1596        void *contents;
1597        ssize_t cpy_amt;
1598
1599        size = foc_get_len(file);
1600        contents = kmalloc(size, MEM_WAIT);
1601        cpy_amt = kread_file(file, contents, size);
1602        if (cpy_amt < 0) {
1603                printk("Error %d reading file %s\n", get_errno(),
1604                       foc_to_name(file));
1605                kfree(contents);
1606                return 0;
1607        }
1608        if (cpy_amt != size) {
1609                printk("Read %d, needed %d for file %s\n", cpy_amt, size,
1610                       foc_to_name(file));
1611                kfree(contents);
1612                return 0;
1613        }
1614        return contents;
1615}
1616
1617/* Process-related File management functions */
1618
1619/* Given any FD, get the appropriate object, 0 o/w. Set incref if you want a
1620 * reference count (which is a 9ns thing, you can't use the pointer if you
1621 * didn't incref). */
1622void *lookup_fd(struct fd_table *fdt, int fd, bool incref)
1623{
1624        void *retval = 0;
1625
1626        if (fd < 0)
1627                return 0;
1628        spin_lock(&fdt->lock);
1629        if (fdt->closed) {
1630                spin_unlock(&fdt->lock);
1631                return 0;
1632        }
1633        if (fd < fdt->max_fdset) {
1634                if (GET_BITMASK_BIT(fdt->open_fds->fds_bits, fd)) {
1635                        /* while max_files and max_fdset might not line up, we
1636                         * should never have a valid fdset higher than files */
1637                        assert(fd < fdt->max_files);
1638                        retval = fdt->fd[fd].fd_chan;
1639                        if (incref)
1640                                chan_incref((struct chan*)retval);
1641                }
1642        }
1643        spin_unlock(&fdt->lock);
1644        return retval;
1645}
1646
1647/* Grow the vfs fd set */
1648static int grow_fd_set(struct fd_table *open_files)
1649{
1650        int n;
1651        struct file_desc *nfd, *ofd;
1652
1653        /* Only update open_fds once. If currently pointing to open_fds_init,
1654         * then update it to point to a newly allocated fd_set with space for
1655         * NR_FILE_DESC_MAX */
1656        if (open_files->open_fds == (struct fd_set*)&open_files->open_fds_init)
1657        {
1658                open_files->open_fds = kzmalloc(sizeof(struct fd_set), 0);
1659                memmove(open_files->open_fds, &open_files->open_fds_init,
1660                        sizeof(struct small_fd_set));
1661        }
1662
1663        /* Grow the open_files->fd array in increments of NR_OPEN_FILES_DEFAULT
1664         */
1665        n = open_files->max_files + NR_OPEN_FILES_DEFAULT;
1666        if (n > NR_FILE_DESC_MAX)
1667                return -EMFILE;
1668        nfd = kzmalloc(n * sizeof(struct file_desc), 0);
1669        if (nfd == NULL)
1670                return -ENOMEM;
1671
1672        /* Move the old array on top of the new one */
1673        ofd = open_files->fd;
1674        memmove(nfd, ofd, open_files->max_files * sizeof(struct file_desc));
1675
1676        /* Update the array and the maxes for both max_files and max_fdset */
1677        open_files->fd = nfd;
1678        open_files->max_files = n;
1679        open_files->max_fdset = n;
1680
1681        /* Only free the old one if it wasn't pointing to open_files->fd_array*/
1682        if (ofd != open_files->fd_array)
1683                kfree(ofd);
1684        return 0;
1685}
1686
1687/* Free the vfs fd set if necessary */
1688static void free_fd_set(struct fd_table *open_files)
1689{
1690        void *free_me;
1691
1692        if (open_files->open_fds != (struct fd_set*)&open_files->open_fds_init)
1693        {
1694                assert(open_files->fd != open_files->fd_array);
1695                /* need to reset the pointers to the internal addrs, in case we
1696                 * take a look while debugging.  0 them out, since they have old
1697                 * data.  our current versions should all be closed. */
1698                memset(&open_files->open_fds_init, 0,
1699                       sizeof(struct small_fd_set));
1700                memset(&open_files->fd_array, 0, sizeof(open_files->fd_array));
1701
1702                free_me = open_files->open_fds;
1703                open_files->open_fds =
1704                        (struct fd_set*)&open_files->open_fds_init;
1705                kfree(free_me);
1706
1707                free_me = open_files->fd;
1708                open_files->fd = open_files->fd_array;
1709                kfree(free_me);
1710        }
1711}
1712
1713/* If FD is in the group, remove it, decref it, and return TRUE. */
1714bool close_fd(struct fd_table *fdt, int fd)
1715{
1716        struct chan *chan = 0;
1717        struct fd_tap *tap = 0;
1718        bool ret = FALSE;
1719
1720        if (fd < 0)
1721                return FALSE;
1722        spin_lock(&fdt->lock);
1723        if (fd < fdt->max_fdset) {
1724                if (GET_BITMASK_BIT(fdt->open_fds->fds_bits, fd)) {
1725                        /* while max_files and max_fdset might not line up, we
1726                         * should never have a valid fdset higher than files */
1727                        assert(fd < fdt->max_files);
1728                        chan = fdt->fd[fd].fd_chan;
1729                        tap = fdt->fd[fd].fd_tap;
1730                        fdt->fd[fd].fd_chan = 0;
1731                        fdt->fd[fd].fd_tap = 0;
1732                        CLR_BITMASK_BIT(fdt->open_fds->fds_bits, fd);
1733                        if (fd < fdt->hint_min_fd)
1734                                fdt->hint_min_fd = fd;
1735                        ret = TRUE;
1736                }
1737        }
1738        spin_unlock(&fdt->lock);
1739        /* Need to decref/cclose outside of the lock; they could sleep */
1740        cclose(chan);
1741        if (tap)
1742                kref_put(&tap->kref);
1743        return ret;
1744}
1745
1746static int __get_fd(struct fd_table *open_files, int low_fd, bool must_use_low)
1747{
1748        int slot = -1;
1749        int error;
1750        bool update_hint = TRUE;
1751
1752        if ((low_fd < 0) || (low_fd > NR_FILE_DESC_MAX))
1753                return -EINVAL;
1754        if (open_files->closed)
1755                return -EINVAL; /* won't matter, they are dying */
1756        if (must_use_low
1757            && GET_BITMASK_BIT(open_files->open_fds->fds_bits, low_fd))
1758                return -ENFILE;
1759        if (low_fd > open_files->hint_min_fd)
1760                update_hint = FALSE;
1761        else
1762                low_fd = open_files->hint_min_fd;
1763        /* Loop until we have a valid slot (we grow the fd_array at the bottom
1764         * of the loop if we haven't found a slot in the current array */
1765        while (slot == -1) {
1766                for (low_fd; low_fd < open_files->max_fdset; low_fd++) {
1767                        if (GET_BITMASK_BIT(open_files->open_fds->fds_bits,
1768                                            low_fd))
1769                                continue;
1770                        slot = low_fd;
1771                        SET_BITMASK_BIT(open_files->open_fds->fds_bits, slot);
1772                        assert(slot < open_files->max_files &&
1773                               open_files->fd[slot].fd_chan == 0);
1774                        /* We know slot >= hint, since we started with hint */
1775                        if (update_hint)
1776                                open_files->hint_min_fd = slot + 1;
1777                        break;
1778                }
1779                if (slot == -1) {
1780                        if ((error = grow_fd_set(open_files)))
1781                                return error;
1782                }
1783        }
1784        return slot;
1785}
1786
1787/* Insert a file or chan (obj, chosen by vfs) into the fd group with fd_flags.
1788 * If must_use_low, then we have to insert at FD = low_fd.  o/w we start looking
1789 * for empty slots at low_fd. */
1790int insert_obj_fdt(struct fd_table *fdt, void *obj, int low_fd, int fd_flags,
1791                   bool must_use_low)
1792{
1793        int slot;
1794
1795        spin_lock(&fdt->lock);
1796        slot = __get_fd(fdt, low_fd, must_use_low);
1797        if (slot < 0) {
1798                spin_unlock(&fdt->lock);
1799                return slot;
1800        }
1801        assert(slot < fdt->max_files &&
1802               fdt->fd[slot].fd_chan == 0);
1803        chan_incref((struct chan*)obj);
1804        fdt->fd[slot].fd_chan = obj;
1805        fdt->fd[slot].fd_flags = fd_flags;
1806        spin_unlock(&fdt->lock);
1807        return slot;
1808}
1809
1810/* Closes all open files.  Mostly just a "put" for all files.  If cloexec, it
1811 * will only close the FDs with FD_CLOEXEC (opened with O_CLOEXEC or fcntld).
1812 *
1813 * Notes on concurrency:
1814 * - Can't hold spinlocks while we call cclose, since it might sleep eventually.
1815 * - We're called from proc_destroy, so we could have concurrent openers trying
1816 *   to add to the group (other syscalls), hence the "closed" flag.
1817 * - dot and slash chans are dealt with in proc_free.  its difficult to close
1818 *   and zero those with concurrent syscalls, since those are a source of krefs.
1819 * - Once we lock and set closed, no further additions can happen.  To simplify
1820 *   our closes, we also allow multiple calls to this func (though that should
1821 *   never happen with the current code). */
1822void close_fdt(struct fd_table *fdt, bool cloexec)
1823{
1824        struct chan *chan;
1825        struct file_desc *to_close;
1826        int idx = 0;
1827
1828        to_close = kzmalloc(sizeof(struct file_desc) * fdt->max_files,
1829                            MEM_WAIT);
1830        spin_lock(&fdt->lock);
1831        if (fdt->closed) {
1832                spin_unlock(&fdt->lock);
1833                kfree(to_close);
1834                return;
1835        }
1836        for (int i = 0; i < fdt->max_fdset; i++) {
1837                if (GET_BITMASK_BIT(fdt->open_fds->fds_bits, i)) {
1838                        /* while max_files and max_fdset might not line up, we
1839                         * should never have a valid fdset higher than files */
1840                        assert(i < fdt->max_files);
1841                        if (cloexec && !(fdt->fd[i].fd_flags & FD_CLOEXEC))
1842                                continue;
1843                        chan = fdt->fd[i].fd_chan;
1844                        to_close[idx].fd_tap = fdt->fd[i].fd_tap;
1845                        fdt->fd[i].fd_tap = 0;
1846                        fdt->fd[i].fd_chan = 0;
1847                        to_close[idx++].fd_chan = chan;
1848                        CLR_BITMASK_BIT(fdt->open_fds->fds_bits, i);
1849                }
1850        }
1851        /* it's just a hint, we can build back up from being 0 */
1852        fdt->hint_min_fd = 0;
1853        if (!cloexec) {
1854                free_fd_set(fdt);
1855                fdt->closed = TRUE;
1856        }
1857        spin_unlock(&fdt->lock);
1858        /* We go through some hoops to close/decref outside the lock.  Nice for
1859         * not holding the lock for a while; critical in case the decref/cclose
1860         * sleeps (it can) */
1861        for (int i = 0; i < idx; i++) {
1862                cclose(to_close[i].fd_chan);
1863                if (to_close[i].fd_tap)
1864                        kref_put(&to_close[i].fd_tap->kref);
1865        }
1866        kfree(to_close);
1867}
1868
1869/* Inserts all of the files from src into dst, used by sys_fork(). */
1870void clone_fdt(struct fd_table *src, struct fd_table *dst)
1871{
1872        struct chan *chan;
1873        int ret;
1874
1875        spin_lock(&src->lock);
1876        if (src->closed) {
1877                spin_unlock(&src->lock);
1878                return;
1879        }
1880        spin_lock(&dst->lock);
1881        if (dst->closed) {
1882                warn("Destination closed before it opened");
1883                spin_unlock(&dst->lock);
1884                spin_unlock(&src->lock);
1885                return;
1886        }
1887        while (src->max_files > dst->max_files) {
1888                ret = grow_fd_set(dst);
1889                if (ret < 0) {
1890                        set_error(-ret, "Failed to grow for a clone_fdt");
1891                        spin_unlock(&dst->lock);
1892                        spin_unlock(&src->lock);
1893                        return;
1894                }
1895        }
1896        for (int i = 0; i < src->max_fdset; i++) {
1897                if (GET_BITMASK_BIT(src->open_fds->fds_bits, i)) {
1898                        /* while max_files and max_fdset might not line up, we
1899                         * should never have a valid fdset higher than files */
1900                        assert(i < src->max_files);
1901                        chan = src->fd[i].fd_chan;
1902                        assert(i < dst->max_files && dst->fd[i].fd_chan == 0);
1903                        SET_BITMASK_BIT(dst->open_fds->fds_bits, i);
1904                        dst->fd[i].fd_chan = chan;
1905                        chan_incref(chan);
1906                }
1907        }
1908        dst->hint_min_fd = src->hint_min_fd;
1909        spin_unlock(&dst->lock);
1910        spin_unlock(&src->lock);
1911}
1912
1913int fd_get_fd_flags(struct fd_table *fdt, int fd)
1914{
1915        int ret = -1;
1916
1917        if (fd < 0)
1918                return -1;
1919        spin_lock(&fdt->lock);
1920        if (fdt->closed) {
1921                spin_unlock(&fdt->lock);
1922                return -1;
1923        }
1924        if ((fd < fdt->max_fdset)
1925            && GET_BITMASK_BIT(fdt->open_fds->fds_bits, fd))
1926                ret = fdt->fd[fd].fd_flags;
1927        spin_unlock(&fdt->lock);
1928        if (ret == -1)
1929                set_error(EBADF, "FD was not open");
1930        return ret;
1931}
1932
1933int fd_set_fd_flags(struct fd_table *fdt, int fd, int new_fl)
1934{
1935        int ret = -1;
1936
1937        if (fd < 0)
1938                return -1;
1939        spin_lock(&fdt->lock);
1940        if (fdt->closed) {
1941                spin_unlock(&fdt->lock);
1942                return -1;
1943        }
1944        if ((fd < fdt->max_fdset)
1945            && GET_BITMASK_BIT(fdt->open_fds->fds_bits, fd))
1946                fdt->fd[fd].fd_flags = new_fl;
1947        spin_unlock(&fdt->lock);
1948        if (ret == -1)
1949                set_error(EBADF, "FD was not open");
1950        return ret;
1951}
1952