/* Copyright (c) 2018 Google Inc
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 *
 * #gtfs, generic tree file system frontend that hooks to a backend 9p device
 */

#include <slab.h>
#include <kmalloc.h>
#include <kref.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <error.h>
#include <pmap.h>
#include <smp.h>
#include <tree_file.h>

struct dev gtfs_devtab;

static char *devname(void)
{
        return gtfs_devtab.name;
}

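/* Note: the embedded tfs must stay the first member; chan_to_gtfs() below
 * casts a tree_file's tfs pointer straight back to its containing gtfs. */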
struct gtfs {
        struct tree_filesystem          tfs;
        struct kref                     users;
};

/* Blob hanging off the fs_file->priv.  The backend chans are only accessed
 * (changed or used) with the corresponding fs_file qlock held.  That's the
 * primary use of the qlock - we might be able to avoid qlocking with increfs
 * and atomics or spinlocks, but be careful of be_length.  Qlocking doesn't
 * matter much yet since #mnt serializes.
 *
 * The walk chan is never opened - it's basically just the walked fid, from
 * which we can do other walks or get the I/O chans.  The read and write chans
 * are opened on demand and closed periodically.  We open them initially on
 * open/create in case we are unable to open them (e.g. unwritable).  Better to
 * find out early than during a long writeback.
 *
 * The mnt server might complain about having too many open fids.  We can run a
 * ktask that periodically closes the be_chans on any LRU'd files.
 *
 * be_{length,mode,mtime} should be what the remote server thinks they are -
 * especially for length and mode.  The invariant is that e.g. the file's length
 * == be_length, and the qlock protects that invariant.  We don't care as much
 * about mtime, since some 9p servers just change that on their own.
 *
 * Also note that you can't trust be_length for directories.  You'll often get
 * 4096 or 0, depending on the 9p server you're talking to. */
struct gtfs_priv {
        struct chan                     *be_walk;       /* never opened */
        struct chan                     *be_read;
        struct chan                     *be_write;
        uint64_t                        be_length;
        uint32_t                        be_mode;
        struct timespec                 be_mtime;
        bool                            was_removed;
};
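
/* The length invariant in action (a hedged example): a cached write extends
 * the frontend immediately - presumably via fs_file_write() growing
 * f->dir.length - while be_length lags until writeback_file() /
 * sync_metadata() wstats the new length to the server.  Paths that write the
 * backend directly (__gtfs_fsf_write()) update be_length inline. */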

static inline struct gtfs_priv *fsf_to_gtfs_priv(struct fs_file *f)
{
        return f->priv;
}

static inline struct gtfs_priv *tf_to_gtfs_priv(struct tree_file *tf)
{
        return fsf_to_gtfs_priv(&tf->file);
}

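/* A note on the error idiom used throughout (hedged summary of Akaros's Plan
 * 9-style error handling): waserror() pushes a jump buffer onto the ERRSTACK;
 * if anything downstream calls error(), control comes back to waserror() with
 * a nonzero value, so we can clean up and nexterror() up the stack.
 * poperror() removes the buffer on the success path. */
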
/* Helper.  Clones the chan (walks to itself) and then opens with omode. */
static struct chan *cclone_and_open(struct chan *c, int omode)
{
        ERRSTACK(1);
        struct chan *new;

        new = cclone(c);
        if (waserror()) {
                cclose(new);
                nexterror();
        }
        new = devtab[new->type].open(new, omode);
        poperror();
        return new;
}

/* Send a wstat with the contents of dir for the file. */
static void wstat_dir(struct fs_file *f, struct dir *dir)
{
        ERRSTACK(1);
        struct gtfs_priv *gp = fsf_to_gtfs_priv(f);
        size_t sz;
        uint8_t *buf;

        sz = sizeD2M(dir);
        buf = kzmalloc(sz, MEM_WAIT);
        convD2M(dir, buf, sz);
        if (waserror()) {
                kfree(buf);
                nexterror();
        }
        devtab[gp->be_walk->type].wstat(gp->be_walk, buf, sz);
        kfree(buf);
        poperror();
}
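
/* A hedged sketch of the convention wstat_dir() callers follow: per 9p, wstat
 * fields left at their "don't touch" values are ignored by the server, and
 * init_empty_dir() presumably initializes every field that way.  So a caller
 * sets only what it wants changed, e.g.:
 *
 *      struct dir dir;
 *
 *      init_empty_dir(&dir);
 *      dir.length = new_len;   // the only field the server will act on
 *      wstat_dir(f, &dir);
 */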

/* Note we only track and thus change the following:
 * - length
 * - mode
 * - mtime (second granularity)
 * If we support chown, we'll have to do something else there.  See
 * fs_file_copy_from_dir(). */
static void sync_metadata(struct fs_file *f)
{
        ERRSTACK(1);
        struct gtfs_priv *gp = fsf_to_gtfs_priv(f);
        struct dir dir;
        bool send_it = false;

        qlock(&f->qlock);
        init_empty_dir(&dir);
        if (f->dir.length != gp->be_length) {
                dir.length = f->dir.length;
                send_it = true;
        }
        if (f->dir.mode != gp->be_mode) {
                dir.mode = f->dir.mode;
                send_it = true;
        }
        if (f->dir.mtime.tv_sec != gp->be_mtime.tv_sec) {
                /* ninep's UFS server assumes you set both atime and mtime */
                dir.atime.tv_sec = f->dir.atime.tv_sec;
                dir.atime.tv_nsec = f->dir.atime.tv_nsec;
                dir.mtime.tv_sec = f->dir.mtime.tv_sec;
                dir.mtime.tv_nsec = f->dir.mtime.tv_nsec;
                send_it = true;
        }
        if (!send_it) {
                qunlock(&f->qlock);
                return;
        }
        if (waserror()) {
                qunlock(&f->qlock);
                nexterror();
        }
        wstat_dir(f, &dir);
        /* We set these after the wstat succeeds.  If we set them earlier, we'd
         * have to roll back.  Remember the invariant: the be_values match the
         * backend's file's values.  We should be able to stat be_walk and check
         * these (though the 9p server might muck with atime/mtime). */
        if (f->dir.length != gp->be_length)
                gp->be_length = f->dir.length;
        if (f->dir.mode != gp->be_mode)
                gp->be_mode = f->dir.mode;
        if (f->dir.mtime.tv_sec != gp->be_mtime.tv_sec)
                gp->be_mtime = f->dir.mtime;
        qunlock(&f->qlock);
        poperror();
}

/* Can throw on error, currently from sync_metadata. */
static void writeback_file(struct fs_file *f)
{
        sync_metadata(f);
        /* This is a lockless peek.  Once a file is dirtied, we never undirty
         * it.  To do so, we need the file qlock (not a big deal, though that
         * may replace the PM qlock), and we still need to handle/scan mmaps.
         * Specifically, we only dirty when an mmap attaches (PROT_WRITE and
         * MAP_SHARED), but we don't know if an existing mapping has caused more
         * dirtying (an mmap can re-dirty then detach before our next
         * writeback).  That usually requires a scan.  This is all an
         * optimization to avoid scanning the entire PM's pages for whether or
         * not they are dirty.
         *
         * Also, our writeback pm op grabs the file's qlock.  So be careful;
         * though we could use another qlock, since we're mostly protecting
         * backend state. */
        if (qid_is_file(f->dir.qid) && (f->flags & FSF_DIRTY))
                pm_writeback_pages(f->pm);
}

static void purge_cb(struct tree_file *tf)
{
        ERRSTACK(1);

        /* discard error, and keep on going if we can. */
        if (!waserror())
                writeback_file(&tf->file);
        poperror();
}

static void gtfs_release(struct kref *kref)
{
        struct gtfs *gtfs = container_of(kref, struct gtfs, users);

        tfs_frontend_purge(&gtfs->tfs, purge_cb);
        /* this is the ref from attach */
        assert(kref_refcnt(&gtfs->tfs.root->kref) == 1);
        tf_kref_put(gtfs->tfs.root);
        /* ensures __tf_free() happens before tfs_destroy */
        rcu_barrier();
        tfs_destroy(&gtfs->tfs);
        kfree(gtfs);
}

static struct gtfs *chan_to_gtfs(struct chan *c)
{
        struct tree_file *tf = chan_to_tree_file(c);

        return (struct gtfs*)(tf->tfs);
}

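/* Lifetime of the gtfs->users kref: the attach takes the initial ref, each
 * walk that produces a new frontend chan (wq->clone != c) takes another, and
 * gtfs_close() drops one per chan.  gtfs_remove() also drops one, since
 * device remove ops consume the chan without a later close.  The final put
 * runs gtfs_release(). */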
static void incref_gtfs_chan(struct chan *c)
{
        kref_get(&chan_to_gtfs(c)->users, 1);
}

static void decref_gtfs_chan(struct chan *c)
{
        kref_put(&chan_to_gtfs(c)->users);
}

static struct walkqid *gtfs_walk(struct chan *c, struct chan *nc, char **name,
                                 unsigned int nname)
{
        struct walkqid *wq;

        wq = tree_chan_walk(c, nc, name, nname);
        if (wq && wq->clone && (wq->clone != c))
                incref_gtfs_chan(wq->clone);
        return wq;
}

/* Given an omode, make sure the be chans are set up */
static void setup_be_chans(struct chan *c, int omode)
{
        ERRSTACK(1);
        struct tree_file *tf = chan_to_tree_file(c);
        struct fs_file *f = &tf->file;
        struct gtfs_priv *gp = fsf_to_gtfs_priv(f);

        qlock(&f->qlock);
        if (waserror()) {
                qunlock(&f->qlock);
                nexterror();
        }
        /* Readers and writers both need be_read.  With fs files you can't have
         * a write-only file, since we need to load the page into the page
         * cache, which is a readpage. */
        if (!gp->be_read)
                gp->be_read = cclone_and_open(gp->be_walk, O_READ);
        if (!gp->be_write && (omode & O_WRITE))
                gp->be_write = cclone_and_open(gp->be_walk, O_WRITE);
        qunlock(&f->qlock);
        poperror();
}

static struct chan *gtfs_open(struct chan *c, int omode)
{
        /* truncate can happen before we setup the be_chans.  if we need those,
         * we can swap the order */
        c = tree_chan_open(c, omode);
        setup_be_chans(c, omode);
        return c;
}

static void gtfs_create(struct chan *c, char *name, int omode, uint32_t perm,
                        char *ext)
{
        tree_chan_create(c, name, omode, perm, ext);
        /* We have to setup *after* create, since it moves the chan from the
         * parent to the new file. */
        setup_be_chans(c, omode);
}

static void gtfs_close(struct chan *c)
{
        tree_chan_close(c);
        decref_gtfs_chan(c);
}

static void gtfs_remove(struct chan *c)
{
        ERRSTACK(1);
        struct gtfs *gtfs = chan_to_gtfs(c);

        if (waserror()) {
                /* Same old pain-in-the-ass for remove */
                kref_put(&gtfs->users);
                nexterror();
        }
        tree_chan_remove(c);
        kref_put(&gtfs->users);
        poperror();
}

static size_t gtfs_wstat(struct chan *c, uint8_t *m_buf, size_t m_buf_sz)
{
        size_t ret;

        ret = tree_chan_wstat(c, m_buf, m_buf_sz);
        /* Tell the backend so that any metadata changes take effect
         * immediately.  Consider chmod +w.  We need to tell the 9p server so
         * that it will allow future accesses. */
        sync_metadata(&chan_to_tree_file(c)->file);
        return ret;
}

/* Caller holds the file's qlock. */
static size_t __gtfs_fsf_read(struct fs_file *f, void *ubuf, size_t n,
                              off64_t off)
{
        struct gtfs_priv *gp = fsf_to_gtfs_priv(f);

        if (!gp->be_read)
                gp->be_read = cclone_and_open(gp->be_walk, O_READ);
        return devtab[gp->be_read->type].read(gp->be_read, ubuf, n, off);
}

/* Reads a file from its backend chan */
static size_t gtfs_fsf_read(struct fs_file *f, void *ubuf, size_t n,
                            off64_t off)
{
        ERRSTACK(1);
        size_t ret;

        qlock(&f->qlock);
        if (waserror()) {
                qunlock(&f->qlock);
                nexterror();
        }
        ret = __gtfs_fsf_read(f, ubuf, n, off);
        qunlock(&f->qlock);
        poperror();
        return ret;
}

/* Caller holds the file's qlock. */
static size_t __gtfs_fsf_write(struct fs_file *f, void *ubuf, size_t n,
                               off64_t off)
{
        struct gtfs_priv *gp = fsf_to_gtfs_priv(f);
        size_t ret;

        if (!gp->be_write)
                gp->be_write = cclone_and_open(gp->be_walk, O_WRITE);
        ret = devtab[gp->be_write->type].write(gp->be_write, ubuf, n, off);
        /* ret bytes written at off extend the backend file to off + ret */
        gp->be_length = MAX(gp->be_length, off + ret);
        return ret;
}

/* Writes a file to its backend chan */
static size_t gtfs_fsf_write(struct fs_file *f, void *ubuf, size_t n,
                             off64_t off)
{
        ERRSTACK(1);
        size_t ret;

        qlock(&f->qlock);
        if (waserror()) {
                qunlock(&f->qlock);
                nexterror();
        }
        ret = __gtfs_fsf_write(f, ubuf, n, off);
        qunlock(&f->qlock);
        poperror();
        return ret;
}

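/* Chan read op.  Directory reads always go to the backend 9p server - the
 * frontend caches file pages (fs_file_read() works through the page cache)
 * but not directory contents - so the server stays authoritative for
 * dirents. */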
static size_t gtfs_read(struct chan *c, void *ubuf, size_t n, off64_t off)
{
        struct tree_file *tf = chan_to_tree_file(c);

        if (tree_file_is_dir(tf))
                return gtfs_fsf_read(&tf->file, ubuf, n, off);
        return fs_file_read(&tf->file, ubuf, n, off);
}

/* Given a file (with dir->name set), couple it and sync to the backend chan.
 * This will store/consume the ref for backend, in the TF (freed with
 * gtfs_tf_free), even on error, unless you zero out the be_walk field. */
static void gtfs_tf_couple_backend(struct tree_file *tf, struct chan *backend)
{
        struct dir *dir;
        struct gtfs_priv *gp = kzmalloc(sizeof(struct gtfs_priv), MEM_WAIT);

        tf->file.priv = gp;
        tf->file.dir.qid = backend->qid;
        gp->be_walk = backend;
        dir = chandirstat(backend);
        if (!dir)
                error(ENOMEM, "chandirstat failed");
        fs_file_copy_from_dir(&tf->file, dir);
        kfree(dir);
        /* For sync_metadata */
        gp->be_length = tf->file.dir.length;
        gp->be_mode = tf->file.dir.mode;
        gp->be_mtime = tf->file.dir.mtime;
}

static void gtfs_tf_free(struct tree_file *tf)
{
        struct gtfs_priv *gp = tf_to_gtfs_priv(tf);

        /* Might have some partially / never constructed tree files */
        if (!gp)
                return;
        if (gp->was_removed) {
                gp->be_walk->type = -1;
                /* sanity */
                assert(kref_refcnt(&gp->be_walk->ref) == 1);
        }
        cclose(gp->be_walk);
        /* I/O chans can be NULL */
        cclose(gp->be_read);
        cclose(gp->be_write);
        kfree(gp);
}

static void gtfs_tf_unlink(struct tree_file *parent, struct tree_file *child)
{
        struct gtfs_priv *gp = tf_to_gtfs_priv(child);
        struct chan *be_walk = gp->be_walk;

        /* Remove clunks the be_walk chan/fid.  If it succeeded (and I think
         * even if it didn't), we shouldn't close that fid again, which is what
         * will happen soon after this function.  The TF code calls unlink, then
         * when the last ref closes the TF, it'll get freed and we'll call back
         * to gtfs_tf_free().
         *
         * This is the same issue we run into with all of the device remove ops
         * where we want to refcnt something hanging off e.g. c->aux.  In 9p,
         * you're not supposed to close a chan/fid that was already removed.
         *
         * Now here's the weird thing.  We can close the be_walk chan after
         * remove, but it's possible that someone has walked and perhaps opened
         * a frontend chan + TF, but hasn't done a read yet.  So someone might
         * want to set up be_read, but they can't due to be_walk being closed.
         * We could give them a 'phase error' (one of 9p's errors for I/O on a
         * removed file).
         *
         * Alternatively, we can mark the gtfs_priv so that when we do free it,
         * we skip the dev.remove, similar to what sysremove() does.  That's
         * probably easier.  This is technically racy, but we know that the
         * release/free method won't be called until we return. */
        gp->was_removed = true;
        devtab[be_walk->type].remove(be_walk);
}

/* Caller sets the name, but doesn't know if it exists or not.  It's our job to
 * find out if it exists and fill in the child structure appropriately.  For
 * negative entries, just flagging it is fine.  Otherwise, we fill in the dir.
 * We should throw on error. */
static void gtfs_tf_lookup(struct tree_file *parent, struct tree_file *child)
{
        struct walkqid *wq;
        struct chan *be_walk = tf_to_gtfs_priv(parent)->be_walk;
        struct chan *child_be_walk;

        wq = devtab[be_walk->type].walk(be_walk, NULL, &child->file.dir.name,
                                        1);
        if (!wq || !wq->clone) {
                kfree(wq);
                /* This isn't racy, since the child isn't linked to the tree
                 * yet. */
                child->flags |= TF_F_NEGATIVE | TF_F_HAS_BEEN_USED;
                return;
        }
        /* walk shouldn't give us the same chan struct since we gave it a name
         * and a NULL nc. */
        assert(wq->clone != be_walk);
        /* only gave it one name, and it didn't fail. */
        assert(wq->nqid == 1);
        /* sanity */
        assert(wq->clone->qid.path == wq->qid[wq->nqid - 1].path);
        child_be_walk = wq->clone;
        kfree(wq);
        gtfs_tf_couple_backend(child, child_be_walk);
}

static void gtfs_tf_create(struct tree_file *parent, struct tree_file *child,
                           int perm)
{
        ERRSTACK(1);
        struct chan *c = cclone(tf_to_gtfs_priv(parent)->be_walk);

        if (waserror()) {
                cclose(c);
                nexterror();
        }
        devtab[c->type].create(c, tree_file_to_name(child), 0, perm,
                               child->file.dir.ext);
        /* The chan c is opened, which we don't want.  We can't cclone it either
         * (since it is opened).  All we can do is have the parent walk again so
         * we can get the child's unopened be_walk chan.  Conveniently, that's
         * basically a lookup, so create is really two things: make it, then
         * look it up from the backend. */
        cclose(c);
        poperror();
        if (waserror()) {
                warn("File %s was created in the backend, but unable to look it up!",
                     tree_file_to_name(child));
                nexterror();
        }
        gtfs_tf_lookup(parent, child);
        poperror();
}

static void gtfs_wstat_rename(struct fs_file *f, const char *name)
{
        struct dir dir;

        init_empty_dir(&dir);
        dir.name = (char*)name;
        wstat_dir(f, &dir);
}

static void gtfs_tf_rename(struct tree_file *tf, struct tree_file *old_parent,
                           struct tree_file *new_parent, const char *name,
                           int flags)
{
        struct chan *tf_c = tf_to_gtfs_priv(tf)->be_walk;
        struct chan *np_c = tf_to_gtfs_priv(new_parent)->be_walk;

        if (!devtab[tf_c->type].rename) {
                /* 9p can handle intra-directory renames, though some Akaros
                 * #devices might throw. */
                if (old_parent == new_parent) {
                        gtfs_wstat_rename(&tf->file, name);
                        return;
                }
                error(EXDEV, "%s: %s doesn't support rename", devname(),
                      devtab[tf_c->type].name);
        }
        devtab[tf_c->type].rename(tf_c, np_c, name, flags);
}

static bool gtfs_tf_has_children(struct tree_file *parent)
{
        struct dir dir[1];

        assert(tree_file_is_dir(parent));       /* TF bug */
        /* Any read should work, but there might be issues asking for something
         * smaller than a dir.
         *
         * Note we use the unlocked read here.  The fs_file's qlock is held by
         * our caller, and we reuse that qlock for the sync for reading/writing.
         */
        return __gtfs_fsf_read(&parent->file, dir, sizeof(struct dir), 0) > 0;
}

struct tree_file_ops gtfs_tf_ops = {
        .free = gtfs_tf_free,
        .unlink = gtfs_tf_unlink,
        .lookup = gtfs_tf_lookup,
        .create = gtfs_tf_create,
        .rename = gtfs_tf_rename,
        .has_children = gtfs_tf_has_children,
};
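
/* A hedged note on the contract above: the TF layer calls these ops with its
 * own locking already in place (e.g. has_children runs with the parent's
 * fs_file qlock held, per its comment), and each op reaches the backend
 * through the be_walk fid stashed in the file's gtfs_priv. */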

/* Fills page with its contents from its backing store file.
 *
 * Note the page/offset might be beyond the current file length, based on the
 * current pagemap code. */
static int gtfs_pm_readpage(struct page_map *pm, struct page *pg)
{
        ERRSTACK(1);
        void *kva = page2kva(pg);
        off64_t offset = pg->pg_index << PGSHIFT;
        size_t ret;

        if (waserror()) {
                poperror();
                return -get_errno();
        }
        /* If offset is beyond the length of the file, the 9p device/server
         * should return 0.  We'll just init an empty page.  The length on the
         * frontend (in the fsf->dir.length) will be adjusted.  The backend will
         * hear about it on the next sync. */
        ret = gtfs_fsf_read(pm->pm_file, kva, PGSIZE, offset);
        poperror();
        if (ret < PGSIZE)
                memset(kva + ret, 0, PGSIZE - ret);
        atomic_or(&pg->pg_flags, PG_UPTODATE);
        return 0;
}

/* Meant to take the page from PM and flush to backing store. */
static int gtfs_pm_writepage(struct page_map *pm, struct page *pg)
{
        ERRSTACK(1);
        struct fs_file *f = pm->pm_file;
        void *kva = page2kva(pg);
        off64_t offset = pg->pg_index << PGSHIFT;
        size_t amt;

        qlock(&f->qlock);
        if (waserror()) {
                qunlock(&f->qlock);
                poperror();
                return -get_errno();
        }
        /* Don't writeback beyond the length of the file.  Most of the time
         * this comes up when the len is in the middle of the last page. */
        if (offset >= fs_file_get_length(f)) {
                qunlock(&f->qlock);
                poperror();
                return 0;
        }
        amt = MIN(PGSIZE, fs_file_get_length(f) - offset);
        __gtfs_fsf_write(f, kva, amt, offset);
        qunlock(&f->qlock);
        poperror();
        return 0;
}

/* Caller holds the file's qlock */
static void __trunc_to(struct fs_file *f, off64_t begin)
{
        struct gtfs_priv *gp = fsf_to_gtfs_priv(f);
        struct dir dir;

        init_empty_dir(&dir);
        dir.length = begin;
        wstat_dir(f, &dir);
        /* recall the invariant: be_length == the backend's length */
        gp->be_length = begin;
}

/* Caller holds the file's qlock */
static void __zero_fill(struct fs_file *f, off64_t begin, off64_t end)
{
        ERRSTACK(1);
        void *zeros;

        if (PGOFF(begin) || PGOFF(end))
                error(EINVAL,
                      "zero_fill had unaligned begin (%p) or end (%p)\n",
                      begin, end);
        zeros = kpages_zalloc(PGSIZE, MEM_WAIT);
        if (waserror()) {
                kpages_free(zeros, PGSIZE);
                nexterror();
        }
        for (off64_t o = begin; o < end; o += PGSIZE)
                __gtfs_fsf_write(f, zeros, PGSIZE, o);
        kpages_free(zeros, PGSIZE);
        poperror();
}

/* The intent here is for the backend to drop all data in the range.  Zeros are
 * OK - any future read should get a zero.
 *
 * These offsets are the beginning and end of the hole to punch.  The TF code
 * already dealt with edge cases, so these happen to be page aligned.  That
 * shouldn't matter for the backend device.
 *
 * Don't worry about a border page for end that is all zeros.
 * fs_file_truncate() rounded up to the nearest page to avoid issues.  The user
 * could manually punch a hole, and they could create a page of zeros at end.
 * We don't care.
 *
 * 9p doesn't have a hole-punch, so we'll truncate if we can and o/w fill with
 * zeros.
 *
 * Note that the frontend's file length often differs from the backend's.
 * Under normal operation, such as writing to a file, the frontend's len will
 * be greater than the backend's.  When we sync, the backend learns the real
 * length.  Similarly, when we shrink a file, the backend's length may be
 * greater than the frontend's.  Consider a truncate from 8192 to 4095: we
 * punch with begin = 4096, end = 8192.  In either case, the backend learns the
 * real length on a sync.  In punch_hole, we're just trying to discard old
 * data. */
static void gtfs_fs_punch_hole(struct fs_file *f, off64_t begin, off64_t end)
{
        ERRSTACK(1);
        struct gtfs_priv *gp = fsf_to_gtfs_priv(f);

        qlock(&f->qlock);
        if (waserror()) {
                qunlock(&f->qlock);
                nexterror();
        }
        if (end >= gp->be_length) {
                if (begin < gp->be_length)
                        __trunc_to(f, begin);
        } else {
                __zero_fill(f, begin, end);
        }
        qunlock(&f->qlock);
        poperror();
}

static bool gtfs_fs_can_grow_to(struct fs_file *f, size_t len)
{
        /* TODO: are there any limits in 9p? */
        return true;
}

struct fs_file_ops gtfs_fs_ops = {
        .readpage = gtfs_pm_readpage,
        .writepage = gtfs_pm_writepage,
        .punch_hole = gtfs_fs_punch_hole,
        .can_grow_to = gtfs_fs_can_grow_to,
};
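
/* These are the page-cache hooks: readpage/writepage shuttle whole pages
 * between the PM and the backend 9p chans, while punch_hole/can_grow_to back
 * the fs_file truncate/grow paths.  A hedged example of the flow: a page
 * fault on an mmaped #gtfs file misses in the PM, which calls
 * gtfs_pm_readpage() to fill the page from the server; a later sync finds the
 * file dirty and pushes pages back through gtfs_pm_writepage(). */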

/* We're passed a backend chan, usually of type #mnt, used for an uncached
 * mount.  We call it 'backend.'  It is the result of an attach, e.g. mntattach.
 * In the case of #mnt, this chan is different than the one that has the 9p
 * server on the other side, called 'mchan'.  That chan is at backend->mchan,
 * and also the struct mnt->c.  The struct mnt is shared by all mounts talking
 * to the 9p server over the mchan, and is stored at mchan->mux.  Backend chans
 * have a strong (counted) ref on the mchan.
 *
 * We create and return a chan of #gtfs, suitable for attaching to the
 * namespace.  This chan will have the root TF hanging off aux, just like how
 * any other attached TFS has a root TF.  #gtfs manages the linkage between a TF
 * and the backend, which is the purpose of gtfs_priv.
 *
 * A note on refcounts: in the normal, uncached operation, the 'backend' chan
 * has a ref (actually a chan kref, which you cclose) on the comms chan (mchan).
 * We get one ref at mntattach time, and every distinct mntwalk gets another
 * ref.  Those actually get closed in chanfree(), since they are stored at
 * mchan.
 *
 * All gtfs *tree_files* have at least one refcounted chan corresponding to the
 * file/FID on the backend server.  Think of it as a 1:1 connection, even though
 * there is more than one chan.  The gtfs device can have many chans pointing to
 * the same TF, which is kreffed.  That TF is 1:1 on a backend object.
 *
 * All walks from this attach point will get chans with TFs from this TFS and
 * will incref the struct gtfs.
 */
static struct chan *gtfs_attach(char *arg)
{
        ERRSTACK(2);
        struct chan *backend = (struct chan*)arg;
        struct chan *frontend;
        struct tree_filesystem *tfs;
        struct gtfs *gtfs;

        frontend = devattach(devname(), 0);
        if (waserror()) {
                /* same as #mnt - don't cclose, since we don't want to devtab
                 * close, and we know the ref == 1 here. */
                chanfree(frontend);
                nexterror();
        }
        gtfs = kzmalloc(sizeof(struct gtfs), MEM_WAIT);
        /* This 'users' kref is the one that every distinct frontend chan has.
         * These come from attaches and successful, 'moving' walks. */
        kref_init(&gtfs->users, gtfs_release, 1);
        tfs = (struct tree_filesystem*)gtfs;
        /* This gives us one ref on root, released during gtfs_release().  name
         * is set to ".", though that gets overwritten during coupling. */
        tfs_init(tfs);
        if (waserror()) {
                /* don't consume the backend ref on error, caller expects to
                 * have it */
                tf_to_gtfs_priv(tfs->root)->be_walk = NULL;
                /* ref from tfs_init.  this should free the TF. */
                tf_kref_put(tfs->root);
                tfs_destroy(tfs);
                kfree(gtfs);
                nexterror();
        }
        /* stores the ref for 'backend' inside tfs->root */
        gtfs_tf_couple_backend(tfs->root, backend);
        poperror();
        tfs->tf_ops = gtfs_tf_ops;
        tfs->fs_ops = gtfs_fs_ops;
        /* need another ref on root for the frontend chan */
        tf_kref_get(tfs->root);
        chan_set_tree_file(frontend, tfs->root);
        poperror();
        return frontend;
}
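
/* A hedged sketch of how an attach is driven (the mntattach call and mount
 * plumbing are assumptions, not defined in this file): some caller attaches
 * the uncached 9p device first, then smuggles that chan in via arg:
 *
 *      struct chan *backend = mntattach(...);          // #mnt chan to 9p srv
 *      struct chan *front = gtfs_devtab.attach((char*)backend);
 *      // 'front' now holds the root TF; mounting it gives a cached view
 *      // of the 9p server, with gtfs consuming the backend ref.
 */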

static bool lru_prune_cb(struct tree_file *tf)
{
        ERRSTACK(1);

        if (waserror()) {
                /* not much to do - ssh the file out? */
                printk("Failed to sync file %s: %s\n", tree_file_to_name(tf),
                       current_errstr());
                poperror();
                return false;
        }
        writeback_file(&tf->file);
        poperror();
        return true;
}

static void pressure_dfs_cb(struct tree_file *tf)
{
        if (!tree_file_is_dir(tf))
                pm_free_unused_pages(tf->file.pm);
}

/* Under memory pressure, there are a bunch of things we can do. */
static void gtfs_free_memory(struct gtfs *gtfs)
{
        /* This attempts to remove every file from the LRU.  It'll write back
         * dirty files, then if they haven't been used since we started, it'll
         * delete the frontend TF, which will delete the entire page cache
         * entry.  The heavy lifting is done by TF code. */
        tfs_lru_for_each(&gtfs->tfs, lru_prune_cb, -1);
        /* This drops the negative TFs.  It's not a huge deal, since they are
         * small, but perhaps it'll help. */
        tfs_lru_prune_neg(&gtfs->tfs);
        /* This will attempt to free memory from all files in the frontend,
         * regardless of whether or not they are in use.  This might help if you
         * have some large files that happened to be open. */
        tfs_frontend_for_each(&gtfs->tfs, pressure_dfs_cb);
}

static void gtfs_sync_tf(struct tree_file *tf)
{
        writeback_file(&tf->file);
}

static void gtfs_sync_gtfs(struct gtfs *gtfs)
{
        tfs_frontend_for_each(&gtfs->tfs, gtfs_sync_tf);
}

static void gtfs_sync_chan(struct chan *c)
{
        gtfs_sync_tf(chan_to_tree_file(c));
}

static void gtfs_sync_chans_fs(struct chan *any_c)
{
        gtfs_sync_gtfs(chan_to_gtfs(any_c));
}

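/* CCTL_SYNC on a directory chan syncs the entire gtfs (every frontend TF); on
 * a regular file chan it syncs just that file.  Everything else falls through
 * to the generic TF chan_ctl. */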
static unsigned long gtfs_chan_ctl(struct chan *c, int op, unsigned long a1,
                                   unsigned long a2, unsigned long a3,
                                   unsigned long a4)
{
        switch (op) {
        case CCTL_SYNC:
                if (tree_file_is_dir(chan_to_tree_file(c)))
                        gtfs_sync_chans_fs(c);
                else
                        gtfs_sync_chan(c);
                return 0;
        default:
                return tree_chan_ctl(c, op, a1, a2, a3, a4);
        }
}

struct dev gtfs_devtab __devtab = {
        .name = "gtfs",

        .reset = devreset,
        .init = devinit,
        .shutdown = devshutdown,
        .attach = gtfs_attach,
        .walk = gtfs_walk,
        .stat = tree_chan_stat,
        .open = gtfs_open,
        .create = gtfs_create,
        .close = gtfs_close,
        .read = gtfs_read,
        .bread = devbread,
        .write = tree_chan_write,
        .bwrite = devbwrite,
        .remove = gtfs_remove,
        .rename = tree_chan_rename,
        .wstat = gtfs_wstat,
        .power = devpower,
        .chaninfo = devchaninfo,
        .mmap = tree_chan_mmap,
        .chan_ctl = gtfs_chan_ctl,
};