diff options
67 files changed, 28066 insertions, 0 deletions
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt new file mode 100644 index 00000000000..6e03917316b --- /dev/null +++ b/Documentation/filesystems/ceph.txt @@ -0,0 +1,139 @@ +Ceph Distributed File System +============================ + +Ceph is a distributed network file system designed to provide good +performance, reliability, and scalability. + +Basic features include: + + * POSIX semantics + * Seamless scaling from 1 to many thousands of nodes + * High availability and reliability. No single points of failure. + * N-way replication of data across storage nodes + * Fast recovery from node failures + * Automatic rebalancing of data on node addition/removal + * Easy deployment: most FS components are userspace daemons + +Also, + * Flexible snapshots (on any directory) + * Recursive accounting (nested files, directories, bytes) + +In contrast to cluster filesystems like GFS, OCFS2, and GPFS that rely +on symmetric access by all clients to shared block devices, Ceph +separates data and metadata management into independent server +clusters, similar to Lustre. Unlike Lustre, however, metadata and +storage nodes run entirely as user space daemons. Storage nodes +utilize btrfs to store data objects, leveraging its advanced features +(checksumming, metadata replication, etc.). File data is striped +across storage nodes in large chunks to distribute workload and +facilitate high throughputs. When storage nodes fail, data is +re-replicated in a distributed fashion by the storage nodes themselves +(with some minimal coordination from a cluster monitor), making the +system extremely efficient and scalable. + +Metadata servers effectively form a large, consistent, distributed +in-memory cache above the file namespace that is extremely scalable, +dynamically redistributes metadata in response to workload changes, +and can tolerate arbitrary (well, non-Byzantine) node failures. The +metadata server takes a somewhat unconventional approach to metadata +storage to significantly improve performance for common workloads. In +particular, inodes with only a single link are embedded in +directories, allowing entire directories of dentries and inodes to be +loaded into its cache with a single I/O operation. The contents of +extremely large directories can be fragmented and managed by +independent metadata servers, allowing scalable concurrent access. + +The system offers automatic data rebalancing/migration when scaling +from a small cluster of just a few nodes to many hundreds, without +requiring an administrator carve the data set into static volumes or +go through the tedious process of migrating data between servers. +When the file system approaches full, new nodes can be easily added +and things will "just work." + +Ceph includes flexible snapshot mechanism that allows a user to create +a snapshot on any subdirectory (and its nested contents) in the +system. Snapshot creation and deletion are as simple as 'mkdir +.snap/foo' and 'rmdir .snap/foo'. + +Ceph also provides some recursive accounting on directories for nested +files and bytes. That is, a 'getfattr -d foo' on any directory in the +system will reveal the total number of nested regular files and +subdirectories, and a summation of all nested file sizes. This makes +the identification of large disk space consumers relatively quick, as +no 'du' or similar recursive scan of the file system is required. + + +Mount Syntax +============ + +The basic mount syntax is: + + # mount -t ceph monip[:port][,monip2[:port]...]:/[subdir] mnt + +You only need to specify a single monitor, as the client will get the +full list when it connects. (However, if the monitor you specify +happens to be down, the mount won't succeed.) The port can be left +off if the monitor is using the default. So if the monitor is at +1.2.3.4, + + # mount -t ceph 1.2.3.4:/ /mnt/ceph + +is sufficient. If /sbin/mount.ceph is installed, a hostname can be +used instead of an IP address. + + + +Mount Options +============= + + ip=A.B.C.D[:N] + Specify the IP and/or port the client should bind to locally. + There is normally not much reason to do this. If the IP is not + specified, the client's IP address is determined by looking at the + address it's connection to the monitor originates from. + + wsize=X + Specify the maximum write size in bytes. By default there is no + maximu. Ceph will normally size writes based on the file stripe + size. + + rsize=X + Specify the maximum readahead. + + mount_timeout=X + Specify the timeout value for mount (in seconds), in the case + of a non-responsive Ceph file system. The default is 30 + seconds. + + rbytes + When stat() is called on a directory, set st_size to 'rbytes', + the summation of file sizes over all files nested beneath that + directory. This is the default. + + norbytes + When stat() is called on a directory, set st_size to the + number of entries in that directory. + + nocrc + Disable CRC32C calculation for data writes. If set, the OSD + must rely on TCP's error correction to detect data corruption + in the data payload. + + noasyncreaddir + Disable client's use its local cache to satisfy readdir + requests. (This does not change correctness; the client uses + cached metadata only when a lease or capability ensures it is + valid.) + + +More Information +================ + +For more information on Ceph, see the home page at + http://ceph.newdream.net/ + +The Linux kernel client source tree is available at + git://ceph.newdream.net/linux-ceph-client.git + +and the source for the full system is at + git://ceph.newdream.net/ceph.git diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 35c9b51d20e..dd5806f4fcc 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -291,6 +291,7 @@ Code Seq#(hex) Include File Comments 0x92 00-0F drivers/usb/mon/mon_bin.c 0x93 60-7F linux/auto_fs.h 0x94 all fs/btrfs/ioctl.h +0x97 00-7F fs/ceph/ioctl.h Ceph file system 0x99 00-0F 537-Addinboard driver <mailto:buk@buks.ipn.de> 0xA0 all linux/sdp/sdp.h Industrial Device Project diff --git a/MAINTAINERS b/MAINTAINERS index 382eaa4d006..449d4440208 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1441,6 +1441,15 @@ F: arch/powerpc/include/asm/spu*.h F: arch/powerpc/oprofile/*cell* F: arch/powerpc/platforms/cell/ +CEPH DISTRIBUTED FILE SYSTEM CLIENT +M: Sage Weil <sage@newdream.net> +L: ceph-devel@lists.sourceforge.net +W: http://ceph.newdream.net/ +T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git +S: Supported +F: Documentation/filesystems/ceph.txt +F: fs/ceph + CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM: M: David Vrabel <david.vrabel@csr.com> L: linux-usb@vger.kernel.org diff --git a/fs/Kconfig b/fs/Kconfig index 7405f071be6..5f85b594761 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -235,6 +235,7 @@ config NFS_COMMON source "net/sunrpc/Kconfig" source "fs/smbfs/Kconfig" +source "fs/ceph/Kconfig" source "fs/cifs/Kconfig" source "fs/ncpfs/Kconfig" source "fs/coda/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index c3633aa4691..97f340f14ba 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -125,3 +125,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ obj-$(CONFIG_EXOFS_FS) += exofs/ +obj-$(CONFIG_CEPH_FS) += ceph/ diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig new file mode 100644 index 00000000000..04b8280582a --- /dev/null +++ b/fs/ceph/Kconfig @@ -0,0 +1,27 @@ +config CEPH_FS + tristate "Ceph distributed file system (EXPERIMENTAL)" + depends on INET && EXPERIMENTAL + select LIBCRC32C + select CONFIG_CRYPTO_AES + help + Choose Y or M here to include support for mounting the + experimental Ceph distributed file system. Ceph is an extremely + scalable file system designed to provide high performance, + reliable access to petabytes of storage. + + More information at http://ceph.newdream.net/. + + If unsure, say N. + +config CEPH_FS_PRETTYDEBUG + bool "Include file:line in ceph debug output" + depends on CEPH_FS + default n + help + If you say Y here, debug output will include a filename and + line to aid debugging. This icnreases kernel size and slows + execution slightly when debug call sites are enabled (e.g., + via CONFIG_DYNAMIC_DEBUG). + + If unsure, say N. + diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile new file mode 100644 index 00000000000..6a660e610be --- /dev/null +++ b/fs/ceph/Makefile @@ -0,0 +1,39 @@ +# +# Makefile for CEPH filesystem. +# + +ifneq ($(KERNELRELEASE),) + +obj-$(CONFIG_CEPH_FS) += ceph.o + +ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \ + export.o caps.o snap.o xattr.o \ + messenger.o msgpool.o buffer.o pagelist.o \ + mds_client.o mdsmap.o \ + mon_client.o \ + osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \ + debugfs.o \ + auth.o auth_none.o \ + crypto.o armor.o \ + auth_x.o \ + ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o + +else +#Otherwise we were called directly from the command +# line; invoke the kernel build system. + +KERNELDIR ?= /lib/modules/$(shell uname -r)/build +PWD := $(shell pwd) + +default: all + +all: + $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules + +modules_install: + $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install + +clean: + $(MAKE) -C $(KERNELDIR) M=$(PWD) clean + +endif diff --git a/fs/ceph/README b/fs/ceph/README new file mode 100644 index 00000000000..18352fab37c --- /dev/null +++ b/fs/ceph/README @@ -0,0 +1,20 @@ +# +# The following files are shared by (and manually synchronized +# between) the Ceph userland and kernel client. +# +# userland kernel +src/include/ceph_fs.h fs/ceph/ceph_fs.h +src/include/ceph_fs.cc fs/ceph/ceph_fs.c +src/include/msgr.h fs/ceph/msgr.h +src/include/rados.h fs/ceph/rados.h +src/include/ceph_strings.cc fs/ceph/ceph_strings.c +src/include/ceph_frag.h fs/ceph/ceph_frag.h +src/include/ceph_frag.cc fs/ceph/ceph_frag.c +src/include/ceph_hash.h fs/ceph/ceph_hash.h +src/include/ceph_hash.cc fs/ceph/ceph_hash.c +src/crush/crush.c fs/ceph/crush/crush.c +src/crush/crush.h fs/ceph/crush/crush.h +src/crush/mapper.c fs/ceph/crush/mapper.c +src/crush/mapper.h fs/ceph/crush/mapper.h +src/crush/hash.h fs/ceph/crush/hash.h +src/crush/hash.c fs/ceph/crush/hash.c diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c new file mode 100644 index 00000000000..23bb0ceabe3 --- /dev/null +++ b/fs/ceph/addr.c @@ -0,0 +1,1188 @@ +#include "ceph_debug.h" + +#include <linux/backing-dev.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/writeback.h> /* generic_writepages */ +#include <linux/pagevec.h> +#include <linux/task_io_accounting_ops.h> + +#include "super.h" +#include "osd_client.h" + +/* + * Ceph address space ops. + * + * There are a few funny things going on here. + * + * The page->private field is used to reference a struct + * ceph_snap_context for _every_ dirty page. This indicates which + * snapshot the page was logically dirtied in, and thus which snap + * context needs to be associated with the osd write during writeback. + * + * Similarly, struct ceph_inode_info maintains a set of counters to + * count dirty pages on the inode. In the absense of snapshots, + * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count. + * + * When a snapshot is taken (that is, when the client receives + * notification that a snapshot was taken), each inode with caps and + * with dirty pages (dirty pages implies there is a cap) gets a new + * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending + * order, new snaps go to the tail). The i_wrbuffer_ref_head count is + * moved to capsnap->dirty. (Unless a sync write is currently in + * progress. In that case, the capsnap is said to be "pending", new + * writes cannot start, and the capsnap isn't "finalized" until the + * write completes (or fails) and a final size/mtime for the inode for + * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0. + * + * On writeback, we must submit writes to the osd IN SNAP ORDER. So, + * we look for the first capsnap in i_cap_snaps and write out pages in + * that snap context _only_. Then we move on to the next capsnap, + * eventually reaching the "live" or "head" context (i.e., pages that + * are not yet snapped) and are writing the most recently dirtied + * pages. + * + * Invalidate and so forth must take care to ensure the dirty page + * accounting is preserved. + */ + +#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10)) +#define CONGESTION_OFF_THRESH(congestion_kb) \ + (CONGESTION_ON_THRESH(congestion_kb) - \ + (CONGESTION_ON_THRESH(congestion_kb) >> 2)) + + + +/* + * Dirty a page. Optimistically adjust accounting, on the assumption + * that we won't race with invalidate. If we do, readjust. + */ +static int ceph_set_page_dirty(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode; + struct ceph_inode_info *ci; + int undo = 0; + struct ceph_snap_context *snapc; + + if (unlikely(!mapping)) + return !TestSetPageDirty(page); + + if (TestSetPageDirty(page)) { + dout("%p set_page_dirty %p idx %lu -- already dirty\n", + mapping->host, page, page->index); + return 0; + } + + inode = mapping->host; + ci = ceph_inode(inode); + + /* + * Note that we're grabbing a snapc ref here without holding + * any locks! + */ + snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); + + /* dirty the head */ + spin_lock(&inode->i_lock); + if (ci->i_wrbuffer_ref_head == 0) + ci->i_head_snapc = ceph_get_snap_context(snapc); + ++ci->i_wrbuffer_ref_head; + if (ci->i_wrbuffer_ref == 0) + igrab(inode); + ++ci->i_wrbuffer_ref; + dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d " + "snapc %p seq %lld (%d snaps)\n", + mapping->host, page, page->index, + ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1, + ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, + snapc, snapc->seq, snapc->num_snaps); + spin_unlock(&inode->i_lock); + + /* now adjust page */ + spin_lock_irq(&mapping->tree_lock); + if (page->mapping) { /* Race with truncate? */ + WARN_ON_ONCE(!PageUptodate(page)); + + if (mapping_cap_account_dirty(mapping)) { + __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_bdi_stat(mapping->backing_dev_info, + BDI_RECLAIMABLE); + task_io_account_write(PAGE_CACHE_SIZE); + } + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + + /* + * Reference snap context in page->private. Also set + * PagePrivate so that we get invalidatepage callback. + */ + page->private = (unsigned long)snapc; + SetPagePrivate(page); + } else { + dout("ANON set_page_dirty %p (raced truncate?)\n", page); + undo = 1; + } + + spin_unlock_irq(&mapping->tree_lock); + + if (undo) + /* whoops, we failed to dirty the page */ + ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + + BUG_ON(!PageDirty(page)); + return 1; +} + +/* + * If we are truncating the full page (i.e. offset == 0), adjust the + * dirty page counters appropriately. Only called if there is private + * data on the page. + */ +static void ceph_invalidatepage(struct page *page, unsigned long offset) +{ + struct inode *inode; + struct ceph_inode_info *ci; + struct ceph_snap_context *snapc = (void *)page->private; + + BUG_ON(!PageLocked(page)); + BUG_ON(!page->private); + BUG_ON(!PagePrivate(page)); + BUG_ON(!page->mapping); + + inode = page->mapping->host; + + /* + * We can get non-dirty pages here due to races between + * set_page_dirty and truncate_complete_page; just spit out a + * warning, in case we end up with accounting problems later. + */ + if (!PageDirty(page)) + pr_err("%p invalidatepage %p page not dirty\n", inode, page); + + if (offset == 0) + ClearPageChecked(page); + + ci = ceph_inode(inode); + if (offset == 0) { + dout("%p invalidatepage %p idx %lu full dirty page %lu\n", + inode, page, page->index, offset); + ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + ceph_put_snap_context(snapc); + page->private = 0; + ClearPagePrivate(page); + } else { + dout("%p invalidatepage %p idx %lu partial dirty page\n", + inode, page, page->index); + } +} + +/* just a sanity check */ +static int ceph_releasepage(struct page *page, gfp_t g) +{ + struct inode *inode = page->mapping ? page->mapping->host : NULL; + dout("%p releasepage %p idx %lu\n", inode, page, page->index); + WARN_ON(PageDirty(page)); + WARN_ON(page->private); + WARN_ON(PagePrivate(page)); + return 0; +} + +/* + * read a single page, without unlocking it. + */ +static int readpage_nounlock(struct file *filp, struct page *page) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; + int err = 0; + u64 len = PAGE_CACHE_SIZE; + + dout("readpage inode %p file %p page %p index %lu\n", + inode, filp, page, page->index); + err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, + page->index << PAGE_CACHE_SHIFT, &len, + ci->i_truncate_seq, ci->i_truncate_size, + &page, 1); + if (err == -ENOENT) + err = 0; + if (err < 0) { + SetPageError(page); + goto out; + } else if (err < PAGE_CACHE_SIZE) { + /* zero fill remainder of page */ + zero_user_segment(page, err, PAGE_CACHE_SIZE); + } + SetPageUptodate(page); + +out: + return err < 0 ? err : 0; +} + +static int ceph_readpage(struct file *filp, struct page *page) +{ + int r = readpage_nounlock(filp, page); + unlock_page(page); + return r; +} + +/* + * Build a vector of contiguous pages from the provided page list. + */ +static struct page **page_vector_from_list(struct list_head *page_list, + unsigned *nr_pages) +{ + struct page **pages; + struct page *page; + int next_index, contig_pages = 0; + + /* build page vector */ + pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS); + if (!pages) + return ERR_PTR(-ENOMEM); + + BUG_ON(list_empty(page_list)); + next_index = list_entry(page_list->prev, struct page, lru)->index; + list_for_each_entry_reverse(page, page_list, lru) { + if (page->index == next_index) { + dout("readpages page %d %p\n", contig_pages, page); + pages[contig_pages] = page; + contig_pages++; + next_index++; + } else { + break; + } + } + *nr_pages = contig_pages; + return pages; +} + +/* + * Read multiple pages. Leave pages we don't read + unlock in page_list; + * the caller (VM) cleans them up. + */ +static int ceph_readpages(struct file *file, struct address_space *mapping, + struct list_head *page_list, unsigned nr_pages) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; + int rc = 0; + struct page **pages; + struct pagevec pvec; + loff_t offset; + u64 len; + + dout("readpages %p file %p nr_pages %d\n", + inode, file, nr_pages); + + pages = page_vector_from_list(page_list, &nr_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + /* guess read extent */ + offset = pages[0]->index << PAGE_CACHE_SHIFT; + len = nr_pages << PAGE_CACHE_SHIFT; + rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, + offset, &len, + ci->i_truncate_seq, ci->i_truncate_size, + pages, nr_pages); + if (rc == -ENOENT) + rc = 0; + if (rc < 0) + goto out; + + /* set uptodate and add to lru in pagevec-sized chunks */ + pagevec_init(&pvec, 0); + for (; !list_empty(page_list) && len > 0; + rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { + struct page *page = + list_entry(page_list->prev, struct page, lru); + + list_del(&page->lru); + + if (rc < (int)PAGE_CACHE_SIZE) { + /* zero (remainder of) page */ + int s = rc < 0 ? 0 : rc; + zero_user_segment(page, s, PAGE_CACHE_SIZE); + } + + if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) { + page_cache_release(page); + dout("readpages %p add_to_page_cache failed %p\n", + inode, page); + continue; + } + dout("readpages %p adding %p idx %lu\n", inode, page, + page->index); + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + if (pagevec_add(&pvec, page) == 0) + pagevec_lru_add_file(&pvec); /* add to lru */ + } + pagevec_lru_add_file(&pvec); + rc = 0; + +out: + kfree(pages); + return rc; +} + +/* + * Get ref for the oldest snapc for an inode with dirty data... that is, the + * only snap context we are allowed to write back. + * + * Caller holds i_lock. + */ +static struct ceph_snap_context *__get_oldest_context(struct inode *inode, + u64 *snap_size) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc = NULL; + struct ceph_cap_snap *capsnap = NULL; + + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, + capsnap->context, capsnap->dirty_pages); + if (capsnap->dirty_pages) { + snapc = ceph_get_snap_context(capsnap->context); + if (snap_size) + *snap_size = capsnap->size; + break; + } + } + if (!snapc && ci->i_snap_realm) { + snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); + dout(" head snapc %p has %d dirty pages\n", + snapc, ci->i_wrbuffer_ref_head); + } + return snapc; +} + +static struct ceph_snap_context *get_oldest_context(struct inode *inode, + u64 *snap_size) +{ + struct ceph_snap_context *snapc = NULL; + + spin_lock(&inode->i_lock); + snapc = __get_oldest_context(inode, snap_size); + spin_unlock(&inode->i_lock); + return snapc; +} + +/* + * Write a single page, but leave the page locked. + * + * If we get a write error, set the page error bit, but still adjust the + * dirty page accounting (i.e., page is no longer dirty). + */ +static int writepage_nounlock(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode; + struct ceph_inode_info *ci; + struct ceph_client *client; + struct ceph_osd_client *osdc; + loff_t page_off = page->index << PAGE_CACHE_SHIFT; + int len = PAGE_CACHE_SIZE; + loff_t i_size; + int err = 0; + struct ceph_snap_context *snapc; + u64 snap_size = 0; + long writeback_stat; + + dout("writepage %p idx %lu\n", page, page->index); + + if (!page->mapping || !page->mapping->host) { + dout("writepage %p - no mapping\n", page); + return -EFAULT; + } + inode = page->mapping->host; + ci = ceph_inode(inode); + client = ceph_inode_to_client(inode); + osdc = &client->osdc; + + /* verify this is a writeable snap context */ + snapc = (void *)page->private; + if (snapc == NULL) { + dout("writepage %p page %p not dirty?\n", inode, page); + goto out; + } + if (snapc != get_oldest_context(inode, &snap_size)) { + dout("writepage %p page %p snapc %p not writeable - noop\n", + inode, page, (void *)page->private); + /* we should only noop if called by kswapd */ + WARN_ON((current->flags & PF_MEMALLOC) == 0); + goto out; + } + + /* is this a partial page at end of file? */ + if (snap_size) + i_size = snap_size; + else + i_size = i_size_read(inode); + if (i_size < page_off + len) + len = i_size - page_off; + + dout("writepage %p page %p index %lu on %llu~%u\n", + inode, page, page->index, page_off, len); + + writeback_stat = atomic_long_inc_return(&client->writeback_count); + if (writeback_stat > + CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) + set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); + + set_page_writeback(page); + err = ceph_osdc_writepages(osdc, ceph_vino(inode), + &ci->i_layout, snapc, + page_off, len, + ci->i_truncate_seq, ci->i_truncate_size, + &inode->i_mtime, + &page, 1, 0, 0, true); + if (err < 0) { + dout("writepage setting page/mapping error %d %p\n", err, page); + SetPageError(page); + mapping_set_error(&inode->i_data, err); + if (wbc) + wbc->pages_skipped++; + } else { + dout("writepage cleaned page %p\n", page); + err = 0; /* vfs expects us to return 0 */ + } + page->private = 0; + ClearPagePrivate(page); + end_page_writeback(page); + ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + ceph_put_snap_context(snapc); +out: + return err; +} + +static int ceph_writepage(struct page *page, struct writeback_control *wbc) +{ + int err; + struct inode *inode = page->mapping->host; + BUG_ON(!inode); + igrab(inode); + err = writepage_nounlock(page, wbc); + unlock_page(page); + iput(inode); + return err; +} + + +/* + * lame release_pages helper. release_pages() isn't exported to + * modules. + */ +static void ceph_release_pages(struct page **pages, int num) +{ + struct pagevec pvec; + int i; + + pagevec_init(&pvec, 0); + for (i = 0; i < num; i++) { + if (pagevec_add(&pvec, pages[i]) == 0) + pagevec_release(&pvec); + } + pagevec_release(&pvec); +} + + +/* + * async writeback completion handler. + * + * If we get an error, set the mapping error bit, but not the individual + * page error bits. + */ +static void writepages_finish(struct ceph_osd_request *req, + struct ceph_msg *msg) +{ + struct inode *inode = req->r_inode; + struct ceph_osd_reply_head *replyhead; + struct ceph_osd_op *op; + struct ceph_inode_info *ci = ceph_inode(inode); + unsigned wrote; + struct page *page; + int i; + struct ceph_snap_context *snapc = req->r_snapc; + struct address_space *mapping = inode->i_mapping; + struct writeback_control *wbc = req->r_wbc; + __s32 rc = -EIO; + u64 bytes = 0; + struct ceph_client *client = ceph_inode_to_client(inode); + long writeback_stat; + unsigned issued = __ceph_caps_issued(ci, NULL); + + /* parse reply */ + replyhead = msg->front.iov_base; + WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); + op = (void *)(replyhead + 1); + rc = le32_to_cpu(replyhead->result); + bytes = le64_to_cpu(op->extent.length); + + if (rc >= 0) { + /* + * Assume we wrote the pages we originally sent. The + * osd might reply with fewer pages if our writeback + * raced with a truncation and was adjusted at the osd, + * so don't believe the reply. + */ + wrote = req->r_num_pages; + } else { + wrote = 0; + mapping_set_error(mapping, rc); + } + dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n", + inode, rc, bytes, wrote); + + /* clean all pages */ + for (i = 0; i < req->r_num_pages; i++) { + page = req->r_pages[i]; + BUG_ON(!page); + WARN_ON(!PageUptodate(page)); + + writeback_stat = + atomic_long_dec_return(&client->writeback_count); + if (writeback_stat < + CONGESTION_OFF_THRESH(client->mount_args->congestion_kb)) + clear_bdi_congested(&client->backing_dev_info, + BLK_RW_ASYNC); + + if (i >= wrote) { + dout("inode %p skipping page %p\n", inode, page); + wbc->pages_skipped++; + } + page->private = 0; + ClearPagePrivate(page); + ceph_put_snap_context(snapc); + dout("unlocking %d %p\n", i, page); + end_page_writeback(page); + + /* + * We lost the cache cap, need to truncate the page before + * it is unlocked, otherwise we'd truncate it later in the + * page truncation thread, possibly losing some data that + * raced its way in + */ + if ((issued & CEPH_CAP_FILE_CACHE) == 0) + generic_error_remove_page(inode->i_mapping, page); + + unlock_page(page); + } + dout("%p wrote+cleaned %d pages\n", inode, wrote); + ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc); + + ceph_release_pages(req->r_pages, req->r_num_pages); + if (req->r_pages_from_pool) + mempool_free(req->r_pages, + ceph_client(inode->i_sb)->wb_pagevec_pool); + else + kfree(req->r_pages); + ceph_osdc_put_request(req); +} + +/* + * allocate a page vec, either directly, or if necessary, via a the + * mempool. we avoid the mempool if we can because req->r_num_pages + * may be less than the maximum write size. + */ +static void alloc_page_vec(struct ceph_client *client, + struct ceph_osd_request *req) +{ + req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, + GFP_NOFS); + if (!req->r_pages) { + req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS); + req->r_pages_from_pool = 1; + WARN_ON(!req->r_pages); + } +} + +/* + * initiate async writeback + */ +static int ceph_writepages_start(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct backing_dev_info *bdi = mapping->backing_dev_info; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *client; + pgoff_t index, start, end; + int range_whole = 0; + int should_loop = 1; + pgoff_t max_pages = 0, max_pages_ever = 0; + struct ceph_snap_context *snapc = NULL, *last_snapc = NULL; + struct pagevec pvec; + int done = 0; + int rc = 0; + unsigned wsize = 1 << inode->i_blkbits; + struct ceph_osd_request *req = NULL; + int do_sync; + u64 snap_size = 0; + + /* + * Include a 'sync' in the OSD request if this is a data + * integrity write (e.g., O_SYNC write or fsync()), or if our + * cap is being revoked. + */ + do_sync = wbc->sync_mode == WB_SYNC_ALL; + if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) + do_sync = 1; + dout("writepages_start %p dosync=%d (mode=%s)\n", + inode, do_sync, + wbc->sync_mode == WB_SYNC_NONE ? "NONE" : + (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); + + client = ceph_inode_to_client(inode); + if (client->mount_state == CEPH_MOUNT_SHUTDOWN) { + pr_warning("writepage_start %p on forced umount\n", inode); + return -EIO; /* we're in a forced umount, don't write! */ + } + if (client->mount_args->wsize && client->mount_args->wsize < wsize) + wsize = client->mount_args->wsize; + if (wsize < PAGE_CACHE_SIZE) + wsize = PAGE_CACHE_SIZE; + max_pages_ever = wsize >> PAGE_CACHE_SHIFT; + + pagevec_init(&pvec, 0); + + /* ?? */ + if (wbc->nonblocking && bdi_write_congested(bdi)) { + dout(" writepages congested\n"); + wbc->encountered_congestion = 1; + goto out_final; + } + + /* where to start/end? */ + if (wbc->range_cyclic) { + start = mapping->writeback_index; /* Start from prev offset */ + end = -1; + dout(" cyclic, start at %lu\n", start); + } else { + start = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + should_loop = 0; + dout(" not cyclic, %lu to %lu\n", start, end); + } + index = start; + +retry: + /* find oldest snap context with dirty data */ + ceph_put_snap_context(snapc); + snapc = get_oldest_context(inode, &snap_size); + if (!snapc) { + /* hmm, why does writepages get called when there + is no dirty data? */ + dout(" no snap context with dirty data?\n"); + goto out; + } + dout(" oldest snapc is %p seq %lld (%d snaps)\n", + snapc, snapc->seq, snapc->num_snaps); + if (last_snapc && snapc != last_snapc) { + /* if we switched to a newer snapc, restart our scan at the + * start of the original file range. */ + dout(" snapc differs from last pass, restarting at %lu\n", + index); + index = start; + } + last_snapc = snapc; + + while (!done && index <= end) { + unsigned i; + int first; + pgoff_t next; + int pvec_pages, locked_pages; + struct page *page; + int want; + u64 offset, len; + struct ceph_osd_request_head *reqhead; + struct ceph_osd_op *op; + long writeback_stat; + + next = 0; + locked_pages = 0; + max_pages = max_pages_ever; + +get_more_pages: + first = -1; + want = min(end - index, + min((pgoff_t)PAGEVEC_SIZE, + max_pages - (pgoff_t)locked_pages) - 1) + + 1; + pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + want); + dout("pagevec_lookup_tag got %d\n", pvec_pages); + if (!pvec_pages && !locked_pages) + break; + for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) { + page = pvec.pages[i]; + dout("? %p idx %lu\n", page, page->index); + if (locked_pages == 0) + lock_page(page); /* first page */ + else if (!trylock_page(page)) + break; + + /* only dirty pages, or our accounting breaks */ + if (unlikely(!PageDirty(page)) || + unlikely(page->mapping != mapping)) { + dout("!dirty or !mapping %p\n", page); + unlock_page(page); + break; + } + if (!wbc->range_cyclic && page->index > end) { + dout("end of range %p\n", page); + done = 1; + unlock_page(page); + break; + } + if (next && (page->index != next)) { + dout("not consecutive %p\n", page); + unlock_page(page); + break; + } + if (wbc->sync_mode != WB_SYNC_NONE) { + dout("waiting on writeback %p\n", page); + wait_on_page_writeback(page); + } + if ((snap_size && page_offset(page) > snap_size) || + (!snap_size && + page_offset(page) > i_size_read(inode))) { + dout("%p page eof %llu\n", page, snap_size ? + snap_size : i_size_read(inode)); + done = 1; + unlock_page(page); + break; + } + if (PageWriteback(page)) { + dout("%p under writeback\n", page); + unlock_page(page); + break; + } + + /* only if matching snap context */ + if (snapc != (void *)page->private) { + dout("page snapc %p != oldest %p\n", + (void *)page->private, snapc); + unlock_page(page); + if (!locked_pages) + continue; /* keep looking for snap */ + break; + } + + if (!clear_page_dirty_for_io(page)) { + dout("%p !clear_page_dirty_for_io\n", page); + unlock_page(page); + break; + } + + /* ok */ + if (locked_pages == 0) { + /* prepare async write request */ + offset = page->index << PAGE_CACHE_SHIFT; + len = wsize; + req = ceph_osdc_new_request(&client->osdc, + &ci->i_layout, + ceph_vino(inode), + offset, &len, + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ONDISK, + snapc, do_sync, + ci->i_truncate_seq, + ci->i_truncate_size, + &inode->i_mtime, true, 1); + max_pages = req->r_num_pages; + + alloc_page_vec(client, req); + req->r_callback = writepages_finish; + req->r_inode = inode; + req->r_wbc = wbc; + } + + /* note position of first page in pvec */ + if (first < 0) + first = i; + dout("%p will write page %p idx %lu\n", + inode, page, page->index); + + writeback_stat = atomic_long_inc_return(&client->writeback_count); + if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) { + set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); + } + + set_page_writeback(page); + req->r_pages[locked_pages] = page; + locked_pages++; + next = page->index + 1; + } + + /* did we get anything? */ + if (!locked_pages) + goto release_pvec_pages; + if (i) { + int j; + BUG_ON(!locked_pages || first < 0); + + if (pvec_pages && i == pvec_pages && + locked_pages < max_pages) { + dout("reached end pvec, trying for more\n"); + pagevec_reinit(&pvec); + goto get_more_pages; + } + + /* shift unused pages over in the pvec... we + * will need to release them below. */ + for (j = i; j < pvec_pages; j++) { + dout(" pvec leftover page %p\n", + pvec.pages[j]); + pvec.pages[j-i+first] = pvec.pages[j]; + } + pvec.nr -= i-first; + } + + /* submit the write */ + offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT; + len = min((snap_size ? snap_size : i_size_read(inode)) - offset, + (u64)locked_pages << PAGE_CACHE_SHIFT); + dout("writepages got %d pages at %llu~%llu\n", + locked_pages, offset, len); + + /* revise final length, page count */ + req->r_num_pages = locked_pages; + reqhead = req->r_request->front.iov_base; + op = (void *)(reqhead + 1); + op->extent.length = cpu_to_le64(len); + op->payload_len = cpu_to_le32(len); + req->r_request->hdr.data_len = cpu_to_le32(len); + + ceph_osdc_start_request(&client->osdc, req, true); + req = NULL; + + /* continue? */ + index = next; + wbc->nr_to_write -= locked_pages; + if (wbc->nr_to_write <= 0) + done = 1; + +release_pvec_pages: + dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr, + pvec.nr ? pvec.pages[0] : NULL); + pagevec_release(&pvec); + + if (locked_pages && !done) + goto retry; + } + + if (should_loop && !done) { + /* more to do; loop back to beginning of file */ + dout("writepages looping back to beginning of file\n"); + should_loop = 0; + index = 0; + goto retry; + } + + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = index; + +out: + if (req) + ceph_osdc_put_request(req); + if (rc > 0) + rc = 0; /* vfs expects us to return 0 */ + ceph_put_snap_context(snapc); + dout("writepages done, rc = %d\n", rc); +out_final: + return rc; +} + + + +/* + * See if a given @snapc is either writeable, or already written. + */ +static int context_is_writeable_or_written(struct inode *inode, + struct ceph_snap_context *snapc) +{ + struct ceph_snap_context *oldest = get_oldest_context(inode, NULL); + return !oldest || snapc->seq <= oldest->seq; +} + +/* + * We are only allowed to write into/dirty the page if the page is + * clean, or already dirty within the same snap context. + */ +static int ceph_update_writeable_page(struct file *file, + loff_t pos, unsigned len, + struct page *page) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; + loff_t page_off = pos & PAGE_CACHE_MASK; + int pos_in_page = pos & ~PAGE_CACHE_MASK; + int end_in_page = pos_in_page + len; + loff_t i_size; + struct ceph_snap_context *snapc; + int r; + +retry_locked: + /* writepages currently holds page lock, but if we change that later, */ + wait_on_page_writeback(page); + + /* check snap context */ + BUG_ON(!ci->i_snap_realm); + down_read(&mdsc->snap_rwsem); + BUG_ON(!ci->i_snap_realm->cached_context); + if (page->private && + (void *)page->private != ci->i_snap_realm->cached_context) { + /* + * this page is already dirty in another (older) snap + * context! is it writeable now? + */ + snapc = get_oldest_context(inode, NULL); + up_read(&mdsc->snap_rwsem); + + if (snapc != (void *)page->private) { + dout(" page %p snapc %p not current or oldest\n", + page, (void *)page->private); + /* + * queue for writeback, and wait for snapc to + * be writeable or written + */ + snapc = ceph_get_snap_context((void *)page->private); + unlock_page(page); + ceph_queue_writeback(inode); + wait_event_interruptible(ci->i_cap_wq, + context_is_writeable_or_written(inode, snapc)); + ceph_put_snap_context(snapc); + return -EAGAIN; + } + + /* yay, writeable, do it now (without dropping page lock) */ + dout(" page %p snapc %p not current, but oldest\n", + page, snapc); + if (!clear_page_dirty_for_io(page)) + goto retry_locked; + r = writepage_nounlock(page, NULL); + if (r < 0) + goto fail_nosnap; + goto retry_locked; + } + + if (PageUptodate(page)) { + dout(" page %p already uptodate\n", page); + return 0; + } + + /* full page? */ + if (pos_in_page == 0 && len == PAGE_CACHE_SIZE) + return 0; + + /* past end of file? */ + i_size = inode->i_size; /* caller holds i_mutex */ + + if (i_size + len > inode->i_sb->s_maxbytes) { + /* file is too big */ + r = -EINVAL; + goto fail; + } + + if (page_off >= i_size || + (pos_in_page == 0 && (pos+len) >= i_size && + end_in_page - pos_in_page != PAGE_CACHE_SIZE)) { + dout(" zeroing %p 0 - %d and %d - %d\n", + page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE); + zero_user_segments(page, + 0, pos_in_page, + end_in_page, PAGE_CACHE_SIZE); + return 0; + } + + /* we need to read it. */ + up_read(&mdsc->snap_rwsem); + r = readpage_nounlock(file, page); + if (r < 0) + goto fail_nosnap; + goto retry_locked; + +fail: + up_read(&mdsc->snap_rwsem); +fail_nosnap: + unlock_page(page); + return r; +} + +/* + * We are only allowed to write into/dirty the page if the page is + * clean, or already dirty within the same snap context. + */ +static int ceph_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = file->f_dentry->d_inode; + struct page *page; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + int r; + + do { + /* get a page*/ + page = grab_cache_page_write_begin(mapping, index, 0); + if (!page) + return -ENOMEM; + *pagep = page; + + dout("write_begin file %p inode %p page %p %d~%d\n", file, + inode, page, (int)pos, (int)len); + + r = ceph_update_writeable_page(file, pos, len, page); + } while (r == -EAGAIN); + + return r; +} + +/* + * we don't do anything in here that simple_write_end doesn't do + * except adjust dirty page accounting and drop read lock on + * mdsc->snap_rwsem. + */ +static int ceph_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_client *client = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = &client->mdsc; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + int check_cap = 0; + + dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, + inode, page, (int)pos, (int)copied, (int)len); + + /* zero the stale part of the page if we did a short copy */ + if (copied < len) + zero_user_segment(page, from+copied, len); + + /* did file size increase? */ + /* (no need for i_size_read(); we caller holds i_mutex */ + if (pos+copied > inode->i_size) + check_cap = ceph_inode_set_size(inode, pos+copied); + + if (!PageUptodate(page)) + SetPageUptodate(page); + + set_page_dirty(page); + + unlock_page(page); + up_read(&mdsc->snap_rwsem); + page_cache_release(page); + + if (check_cap) + ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); + + return copied; +} + +/* + * we set .direct_IO to indicate direct io is supported, but since we + * intercept O_DIRECT reads and writes early, this function should + * never get called. + */ +static ssize_t ceph_direct_io(int rw, struct kiocb *iocb, + const struct iovec *iov, + loff_t pos, unsigned long nr_segs) +{ + WARN_ON(1); + return -EINVAL; +} + +const struct address_space_operations ceph_aops = { + .readpage = ceph_readpage, + .readpages = ceph_readpages, + .writepage = ceph_writepage, + .writepages = ceph_writepages_start, + .write_begin = ceph_write_begin, + .write_end = ceph_write_end, + .set_page_dirty = ceph_set_page_dirty, + .invalidatepage = ceph_invalidatepage, + .releasepage = ceph_releasepage, + .direct_IO = ceph_direct_io, +}; + + +/* + * vm ops + */ + +/* + * Reuse write_begin here for simplicity. + */ +static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct page *page = vmf->page; + struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; + loff_t off = page->index << PAGE_CACHE_SHIFT; + loff_t size, len; + int ret; + + size = i_size_read(inode); + if (off + PAGE_CACHE_SIZE <= size) + len = PAGE_CACHE_SIZE; + else + len = size & ~PAGE_CACHE_MASK; + + dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode, + off, len, page, page->index); + + lock_page(page); + + ret = VM_FAULT_NOPAGE; + if ((off > size) || + (page->mapping != inode->i_mapping)) + goto out; + + ret = ceph_update_writeable_page(vma->vm_file, off, len, page); + if (ret == 0) { + /* success. we'll keep the page locked. */ + set_page_dirty(page); + up_read(&mdsc->snap_rwsem); + ret = VM_FAULT_LOCKED; + } else { + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else + ret = VM_FAULT_SIGBUS; + } +out: + dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret); + if (ret != VM_FAULT_LOCKED) + unlock_page(page); + return ret; +} + +static struct vm_operations_struct ceph_vmops = { + .fault = filemap_fault, + .page_mkwrite = ceph_page_mkwrite, +}; + +int ceph_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct address_space *mapping = file->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + file_accessed(file); + vma->vm_ops = &ceph_vmops; + vma->vm_flags |= VM_CAN_NONLINEAR; + return 0; +} diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c new file mode 100644 index 00000000000..67b2c030924 --- /dev/null +++ b/fs/ceph/armor.c @@ -0,0 +1,99 @@ + +#include <linux/errno.h> + +/* + * base64 encode/decode. + */ + +const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + +static int encode_bits(int c) +{ + return pem_key[c]; +} + +static int decode_bits(char c) +{ + if (c >= 'A' && c <= 'Z') + return c - 'A'; + if (c >= 'a' && c <= 'z') + return c - 'a' + 26; + if (c >= '0' && c <= '9') + return c - '0' + 52; + if (c == '+') + return 62; + if (c == '/') + return 63; + if (c == '=') + return 0; /* just non-negative, please */ + return -EINVAL; +} + +int ceph_armor(char *dst, const char *src, const char *end) +{ + int olen = 0; + int line = 0; + + while (src < end) { + unsigned char a, b, c; + + a = *src++; + *dst++ = encode_bits(a >> 2); + if (src < end) { + b = *src++; + *dst++ = encode_bits(((a & 3) << 4) | (b >> 4)); + if (src < end) { + c = *src++; + *dst++ = encode_bits(((b & 15) << 2) | + (c >> 6)); + *dst++ = encode_bits(c & 63); + } else { + *dst++ = encode_bits((b & 15) << 2); + *dst++ = '='; + } + } else { + *dst++ = encode_bits(((a & 3) << 4)); + *dst++ = '='; + *dst++ = '='; + } + olen += 4; + line += 4; + if (line == 64) { + line = 0; + *(dst++) = '\n'; + olen++; + } + } + return olen; +} + +int ceph_unarmor(char *dst, const char *src, const char *end) +{ + int olen = 0; + + while (src < end) { + int a, b, c, d; + + if (src < end && src[0] == '\n') + src++; + if (src + 4 > end) + return -EINVAL; + a = decode_bits(src[0]); + b = decode_bits(src[1]); + c = decode_bits(src[2]); + d = decode_bits(src[3]); + if (a < 0 || b < 0 || c < 0 || d < 0) + return -EINVAL; + + *dst++ = (a << 2) | (b >> 4); + if (src[2] == '=') + return olen + 1; + *dst++ = ((b & 15) << 4) | (c >> 2); + if (src[3] == '=') + return olen + 2; + *dst++ = ((c & 3) << 6) | d; + olen += 3; + src += 4; + } + return olen; +} diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c new file mode 100644 index 00000000000..abb204fea6c --- /dev/null +++ b/fs/ceph/auth.c @@ -0,0 +1,257 @@ +#include "ceph_debug.h" + +#include <linux/module.h> +#include <linux/err.h> + +#include "types.h" +#include "auth_none.h" +#include "auth_x.h" +#include "decode.h" +#include "super.h" + +#include "messenger.h" + +/* + * get protocol handler + */ +static u32 supported_protocols[] = { + CEPH_AUTH_NONE, + CEPH_AUTH_CEPHX +}; + +int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol) +{ + switch (protocol) { + case CEPH_AUTH_NONE: + return ceph_auth_none_init(ac); + case CEPH_AUTH_CEPHX: + return ceph_x_init(ac); + default: + return -ENOENT; + } +} + +/* + * setup, teardown. + */ +struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret) +{ + struct ceph_auth_client *ac; + int ret; + + dout("auth_init name '%s' secret '%s'\n", name, secret); + + ret = -ENOMEM; + ac = kzalloc(sizeof(*ac), GFP_NOFS); + if (!ac) + goto out; + + ac->negotiating = true; + if (name) + ac->name = name; + else + ac->name = CEPH_AUTH_NAME_DEFAULT; + dout("auth_init name %s secret %s\n", ac->name, secret); + ac->secret = secret; + return ac; + +out: + return ERR_PTR(ret); +} + +void ceph_auth_destroy(struct ceph_auth_client *ac) +{ + dout("auth_destroy %p\n", ac); + if (ac->ops) + ac->ops->destroy(ac); + kfree(ac); +} + +/* + * Reset occurs when reconnecting to the monitor. + */ +void ceph_auth_reset(struct ceph_auth_client *ac) +{ + dout("auth_reset %p\n", ac); + if (ac->ops && !ac->negotiating) + ac->ops->reset(ac); + ac->negotiating = true; +} + +int ceph_entity_name_encode(const char *name, void **p, void *end) +{ + int len = strlen(name); + + if (*p + 2*sizeof(u32) + len > end) + return -ERANGE; + ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT); + ceph_encode_32(p, len); + ceph_encode_copy(p, name, len); + return 0; +} + +/* + * Initiate protocol negotiation with monitor. Include entity name + * and list supported protocols. + */ +int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len) +{ + struct ceph_mon_request_header *monhdr = buf; + void *p = monhdr + 1, *end = buf + len, *lenp; + int i, num; + int ret; + + dout("auth_build_hello\n"); + monhdr->have_version = 0; + monhdr->session_mon = cpu_to_le16(-1); + monhdr->session_mon_tid = 0; + + ceph_encode_32(&p, 0); /* no protocol, yet */ + + lenp = p; + p += sizeof(u32); + + ceph_decode_need(&p, end, 1 + sizeof(u32), bad); + ceph_encode_8(&p, 1); + num = ARRAY_SIZE(supported_protocols); + ceph_encode_32(&p, num); + ceph_decode_need(&p, end, num * sizeof(u32), bad); + for (i = 0; i < num; i++) + ceph_encode_32(&p, supported_protocols[i]); + + ret = ceph_entity_name_encode(ac->name, &p, end); + if (ret < 0) + return ret; + ceph_decode_need(&p, end, sizeof(u64), bad); + ceph_encode_64(&p, ac->global_id); + + ceph_encode_32(&lenp, p - lenp - sizeof(u32)); + return p - buf; + +bad: + return -ERANGE; +} + +int ceph_build_auth_request(struct ceph_auth_client *ac, + void *msg_buf, size_t msg_len) +{ + struct ceph_mon_request_header *monhdr = msg_buf; + void *p = monhdr + 1; + void *end = msg_buf + msg_len; + int ret; + + monhdr->have_version = 0; + monhdr->session_mon = cpu_to_le16(-1); + monhdr->session_mon_tid = 0; + + ceph_encode_32(&p, ac->protocol); + + ret = ac->ops->build_request(ac, p + sizeof(u32), end); + if (ret < 0) { + pr_err("error %d building request\n", ret); + return ret; + } + dout(" built request %d bytes\n", ret); + ceph_encode_32(&p, ret); + return p + ret - msg_buf; +} + +/* + * Handle auth message from monitor. + */ +int ceph_handle_auth_reply(struct ceph_auth_client *ac, + void *buf, size_t len, + void *reply_buf, size_t reply_len) +{ + void *p = buf; + void *end = buf + len; + int protocol; + s32 result; + u64 global_id; + void *payload, *payload_end; + int payload_len; + char *result_msg; + int result_msg_len; + int ret = -EINVAL; + + dout("handle_auth_reply %p %p\n", p, end); + ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad); + protocol = ceph_decode_32(&p); + result = ceph_decode_32(&p); + global_id = ceph_decode_64(&p); + payload_len = ceph_decode_32(&p); + payload = p; + p += payload_len; + ceph_decode_need(&p, end, sizeof(u32), bad); + result_msg_len = ceph_decode_32(&p); + result_msg = p; + p += result_msg_len; + if (p != end) + goto bad; + + dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len, + result_msg, global_id, payload_len); + + payload_end = payload + payload_len; + + if (global_id && ac->global_id != global_id) { + dout(" set global_id %lld -> %lld\n", ac->global_id, global_id); + ac->global_id = global_id; + } + + if (ac->negotiating) { + /* server does not support our protocols? */ + if (!protocol && result < 0) { + ret = result; + goto out; + } + /* set up (new) protocol handler? */ + if (ac->protocol && ac->protocol != protocol) { + ac->ops->destroy(ac); + ac->protocol = 0; + ac->ops = NULL; + } + if (ac->protocol != protocol) { + ret = ceph_auth_init_protocol(ac, protocol); + if (ret) { + pr_err("error %d on auth protocol %d init\n", + ret, protocol); + goto out; + } + } + + ac->negotiating = false; + } + + ret = ac->ops->handle_reply(ac, result, payload, payload_end); + if (ret == -EAGAIN) { + return ceph_build_auth_request(ac, reply_buf, reply_len); + } else if (ret) { + pr_err("authentication error %d\n", ret); + return ret; + } + return 0; + +bad: + pr_err("failed to decode auth msg\n"); +out: + return ret; +} + +int ceph_build_auth(struct ceph_auth_client *ac, + void *msg_buf, size_t msg_len) +{ + if (!ac->protocol) + return ceph_auth_build_hello(ac, msg_buf, msg_len); + BUG_ON(!ac->ops); + if (!ac->ops->is_authenticated(ac)) + return ceph_build_auth_request(ac, msg_buf, msg_len); + return 0; +} + +int ceph_auth_is_authenticated(struct ceph_auth_client *ac) +{ + if (!ac->ops) + return 0; + return ac->ops->is_authenticated(ac); +} diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h new file mode 100644 index 00000000000..ca4f57cfb26 --- /dev/null +++ b/fs/ceph/auth.h @@ -0,0 +1,84 @@ +#ifndef _FS_CEPH_AUTH_H +#define _FS_CEPH_AUTH_H + +#include "types.h" +#include "buffer.h" + +/* + * Abstract interface for communicating with the authenticate module. + * There is some handshake that takes place between us and the monitor + * to acquire the necessary keys. These are used to generate an + * 'authorizer' that we use when connecting to a service (mds, osd). + */ + +struct ceph_auth_client; +struct ceph_authorizer; + +struct ceph_auth_client_ops { + /* + * true if we are authenticated and can connect to + * services. + */ + int (*is_authenticated)(struct ceph_auth_client *ac); + + /* + * build requests and process replies during monitor + * handshake. if handle_reply returns -EAGAIN, we build + * another request. + */ + int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end); + int (*handle_reply)(struct ceph_auth_client *ac, int result, + void *buf, void *end); + + /* + * Create authorizer for connecting to a service, and verify + * the response to authenticate the service. + */ + int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type, + struct ceph_authorizer **a, + void **buf, size_t *len, + void **reply_buf, size_t *reply_len); + int (*verify_authorizer_reply)(struct ceph_auth_client *ac, + struct ceph_authorizer *a, size_t len); + void (*destroy_authorizer)(struct ceph_auth_client *ac, + struct ceph_authorizer *a); + void (*invalidate_authorizer)(struct ceph_auth_client *ac, + int peer_type); + + /* reset when we (re)connect to a monitor */ + void (*reset)(struct ceph_auth_client *ac); + + void (*destroy)(struct ceph_auth_client *ac); +}; + +struct ceph_auth_client { + u32 protocol; /* CEPH_AUTH_* */ + void *private; /* for use by protocol implementation */ + const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */ + + bool negotiating; /* true if negotiating protocol */ + const char *name; /* entity name */ + u64 global_id; /* our unique id in system */ + const char *secret; /* our secret key */ + unsigned want_keys; /* which services we want */ +}; + +extern struct ceph_auth_client *ceph_auth_init(const char *name, + const char *secret); +extern void ceph_auth_destroy(struct ceph_auth_client *ac); + +extern void ceph_auth_reset(struct ceph_auth_client *ac); + +extern int ceph_auth_build_hello(struct ceph_auth_client *ac, + void *buf, size_t len); +extern int ceph_handle_auth_reply(struct ceph_auth_client *ac, + void *buf, size_t len, + void *reply_buf, size_t reply_len); +extern int ceph_entity_name_encode(const char *name, void **p, void *end); + +extern int ceph_build_auth(struct ceph_auth_client *ac, + void *msg_buf, size_t msg_len); + +extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac); + +#endif diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c new file mode 100644 index 00000000000..b4ef6f0a6c8 --- /dev/null +++ b/fs/ceph/auth_none.c @@ -0,0 +1,121 @@ + +#include "ceph_debug.h" + +#include <linux/err.h> +#include <linux/module.h> +#include <linux/random.h> + +#include "auth_none.h" +#include "auth.h" +#include "decode.h" + +static void reset(struct ceph_auth_client *ac) +{ + struct ceph_auth_none_info *xi = ac->private; + + xi->starting = true; + xi->built_authorizer = false; +} + +static void destroy(struct ceph_auth_client *ac) +{ + kfree(ac->private); + ac->private = NULL; +} + +static int is_authenticated(struct ceph_auth_client *ac) +{ + struct ceph_auth_none_info *xi = ac->private; + + return !xi->starting; +} + +/* + * the generic auth code decode the global_id, and we carry no actual + * authenticate state, so nothing happens here. + */ +static int handle_reply(struct ceph_auth_client *ac, int result, + void *buf, void *end) +{ + struct ceph_auth_none_info *xi = ac->private; + + xi->starting = false; + return result; +} + +/* + * build an 'authorizer' with our entity_name and global_id. we can + * reuse a single static copy since it is identical for all services + * we connect to. + */ +static int ceph_auth_none_create_authorizer( + struct ceph_auth_client *ac, int peer_type, + struct ceph_authorizer **a, + void **buf, size_t *len, + void **reply_buf, size_t *reply_len) +{ + struct ceph_auth_none_info *ai = ac->private; + struct ceph_none_authorizer *au = &ai->au; + void *p, *end; + int ret; + + if (!ai->built_authorizer) { + p = au->buf; + end = p + sizeof(au->buf); + ceph_encode_8(&p, 1); + ret = ceph_entity_name_encode(ac->name, &p, end - 8); + if (ret < 0) + goto bad; + ceph_decode_need(&p, end, sizeof(u64), bad2); + ceph_encode_64(&p, ac->global_id); + au->buf_len = p - (void *)au->buf; + ai->built_authorizer = true; + dout("built authorizer len %d\n", au->buf_len); + } + + *a = (struct ceph_authorizer *)au; + *buf = au->buf; + *len = au->buf_len; + *reply_buf = au->reply_buf; + *reply_len = sizeof(au->reply_buf); + return 0; + +bad2: + ret = -ERANGE; +bad: + return ret; +} + +static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac, + struct ceph_authorizer *a) +{ + /* nothing to do */ +} + +static const struct ceph_auth_client_ops ceph_auth_none_ops = { + .reset = reset, + .destroy = destroy, + .is_authenticated = is_authenticated, + .handle_reply = handle_reply, + .create_authorizer = ceph_auth_none_create_authorizer, + .destroy_authorizer = ceph_auth_none_destroy_authorizer, +}; + +int ceph_auth_none_init(struct ceph_auth_client *ac) +{ + struct ceph_auth_none_info *xi; + + dout("ceph_auth_none_init %p\n", ac); + xi = kzalloc(sizeof(*xi), GFP_NOFS); + if (!xi) + return -ENOMEM; + + xi->starting = true; + xi->built_authorizer = false; + + ac->protocol = CEPH_AUTH_NONE; + ac->private = xi; + ac->ops = &ceph_auth_none_ops; + return 0; +} + diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h new file mode 100644 index 00000000000..56c05533a31 --- /dev/null +++ b/fs/ceph/auth_none.h @@ -0,0 +1,28 @@ +#ifndef _FS_CEPH_AUTH_NONE_H +#define _FS_CEPH_AUTH_NONE_H + +#include "auth.h" + +/* + * null security mode. + * + * we use a single static authorizer that simply encodes our entity name + * and global id. + */ + +struct ceph_none_authorizer { + char buf[128]; + int buf_len; + char reply_buf[0]; +}; + +struct ceph_auth_none_info { + bool starting; + bool built_authorizer; + struct ceph_none_authorizer au; /* we only need one; it's static */ +}; + +extern int ceph_auth_none_init(struct ceph_auth_client *ac); + +#endif + diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c new file mode 100644 index 00000000000..f0318427b6d --- /dev/null +++ b/fs/ceph/auth_x.c @@ -0,0 +1,656 @@ + +#include "ceph_debug.h" + +#include <linux/err.h> +#include <linux/module.h> +#include <linux/random.h> + +#include "auth_x.h" +#include "auth_x_protocol.h" +#include "crypto.h" +#include "auth.h" +#include "decode.h" + +struct kmem_cache *ceph_x_ticketbuf_cachep; + +#define TEMP_TICKET_BUF_LEN 256 + +static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed); + +static int ceph_x_is_authenticated(struct ceph_auth_client *ac) +{ + struct ceph_x_info *xi = ac->private; + int need; + + ceph_x_validate_tickets(ac, &need); + dout("ceph_x_is_authenticated want=%d need=%d have=%d\n", + ac->want_keys, need, xi->have_keys); + return (ac->want_keys & xi->have_keys) == ac->want_keys; +} + +static int ceph_x_encrypt(struct ceph_crypto_key *secret, + void *ibuf, int ilen, void *obuf, size_t olen) +{ + struct ceph_x_encrypt_header head = { + .struct_v = 1, + .magic = cpu_to_le64(CEPHX_ENC_MAGIC) + }; + size_t len = olen - sizeof(u32); + int ret; + + ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len, + &head, sizeof(head), ibuf, ilen); + if (ret) + return ret; + ceph_encode_32(&obuf, len); + return len + sizeof(u32); +} + +static int ceph_x_decrypt(struct ceph_crypto_key *secret, + void **p, void *end, void *obuf, size_t olen) +{ + struct ceph_x_encrypt_header head; + size_t head_len = sizeof(head); + int len, ret; + + len = ceph_decode_32(p); + if (*p + len > end) + return -EINVAL; + + dout("ceph_x_decrypt len %d\n", len); + ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen, + *p, len); + if (ret) + return ret; + if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC) + return -EPERM; + *p += len; + return olen; +} + +/* + * get existing (or insert new) ticket handler + */ +struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac, + int service) +{ + struct ceph_x_ticket_handler *th; + struct ceph_x_info *xi = ac->private; + struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node; + + while (*p) { + parent = *p; + th = rb_entry(parent, struct ceph_x_ticket_handler, node); + if (service < th->service) + p = &(*p)->rb_left; + else if (service > th->service) + p = &(*p)->rb_right; + else + return th; + } + + /* add it */ + th = kzalloc(sizeof(*th), GFP_NOFS); + if (!th) + return ERR_PTR(-ENOMEM); + th->service = service; + rb_link_node(&th->node, parent, p); + rb_insert_color(&th->node, &xi->ticket_handlers); + return th; +} + +static void remove_ticket_handler(struct ceph_auth_client *ac, + struct ceph_x_ticket_handler *th) +{ + struct ceph_x_info *xi = ac->private; + + dout("remove_ticket_handler %p %d\n", th, th->service); + rb_erase(&th->node, &xi->ticket_handlers); + ceph_crypto_key_destroy(&th->session_key); + if (th->ticket_blob) + ceph_buffer_put(th->ticket_blob); + kfree(th); +} + +static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, + struct ceph_crypto_key *secret, + void *buf, void *end) +{ + struct ceph_x_info *xi = ac->private; + int num; + void *p = buf; + int ret; + char *dbuf; + char *ticket_buf; + u8 struct_v; + + dbuf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS | GFP_ATOMIC); + if (!dbuf) + return -ENOMEM; + + ret = -ENOMEM; + ticket_buf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, + GFP_NOFS | GFP_ATOMIC); + if (!ticket_buf) + goto out_dbuf; + + ceph_decode_need(&p, end, 1 + sizeof(u32), bad); + struct_v = ceph_decode_8(&p); + if (struct_v != 1) + goto bad; + num = ceph_decode_32(&p); + dout("%d tickets\n", num); + while (num--) { + int type; + u8 struct_v; + struct ceph_x_ticket_handler *th; + void *dp, *dend; + int dlen; + char is_enc; + struct timespec validity; + struct ceph_crypto_key old_key; + void *tp, *tpend; + + ceph_decode_need(&p, end, sizeof(u32) + 1, bad); + + type = ceph_decode_32(&p); + dout(" ticket type %d %s\n", type, ceph_entity_type_name(type)); + + struct_v = ceph_decode_8(&p); + if (struct_v != 1) + goto bad; + + th = get_ticket_handler(ac, type); + if (IS_ERR(th)) { + ret = PTR_ERR(th); + goto out; + } + + /* blob for me */ + dlen = ceph_x_decrypt(secret, &p, end, dbuf, + TEMP_TICKET_BUF_LEN); + if (dlen <= 0) { + ret = dlen; + goto out; + } + dout(" decrypted %d bytes\n", dlen); + dend = dbuf + dlen; + dp = dbuf; + + struct_v = ceph_decode_8(&dp); + if (struct_v != 1) + goto bad; + + memcpy(&old_key, &th->session_key, sizeof(old_key)); + ret = ceph_crypto_key_decode(&th->session_key, &dp, dend); + if (ret) + goto out; + + ceph_decode_copy(&dp, &th->validity, sizeof(th->validity)); + ceph_decode_timespec(&validity, &th->validity); + th->expires = get_seconds() + validity.tv_sec; + th->renew_after = th->expires - (validity.tv_sec / 4); + dout(" expires=%lu renew_after=%lu\n", th->expires, + th->renew_after); + + /* ticket blob for service */ + ceph_decode_8_safe(&p, end, is_enc, bad); + tp = ticket_buf; + if (is_enc) { + /* encrypted */ + dout(" encrypted ticket\n"); + dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf, + TEMP_TICKET_BUF_LEN); + if (dlen < 0) { + ret = dlen; + goto out; + } + dlen = ceph_decode_32(&tp); + } else { + /* unencrypted */ + ceph_decode_32_safe(&p, end, dlen, bad); + ceph_decode_need(&p, end, dlen, bad); + ceph_decode_copy(&p, ticket_buf, dlen); + } + tpend = tp + dlen; + dout(" ticket blob is %d bytes\n", dlen); + ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad); + struct_v = ceph_decode_8(&tp); + th->secret_id = ceph_decode_64(&tp); + ret = ceph_decode_buffer(&th->ticket_blob, &tp, tpend); + if (ret) + goto out; + dout(" got ticket service %d (%s) secret_id %lld len %d\n", + type, ceph_entity_type_name(type), th->secret_id, + (int)th->ticket_blob->vec.iov_len); + xi->have_keys |= th->service; + } + + ret = 0; +out: + kmem_cache_free(ceph_x_ticketbuf_cachep, ticket_buf); +out_dbuf: + kmem_cache_free(ceph_x_ticketbuf_cachep, dbuf); + return ret; + +bad: + ret = -EINVAL; + goto out; +} + +static int ceph_x_build_authorizer(struct ceph_auth_client *ac, + struct ceph_x_ticket_handler *th, + struct ceph_x_authorizer *au) +{ + int len; + struct ceph_x_authorize_a *msg_a; + struct ceph_x_authorize_b msg_b; + void *p, *end; + int ret; + int ticket_blob_len = + (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0); + + dout("build_authorizer for %s %p\n", + ceph_entity_type_name(th->service), au); + + len = sizeof(*msg_a) + sizeof(msg_b) + sizeof(u32) + + ticket_blob_len + 16; + dout(" need len %d\n", len); + if (au->buf && au->buf->alloc_len < len) { + ceph_buffer_put(au->buf); + au->buf = NULL; + } + if (!au->buf) { + au->buf = ceph_buffer_new(len, GFP_NOFS); + if (!au->buf) + return -ENOMEM; + } + au->service = th->service; + + msg_a = au->buf->vec.iov_base; + msg_a->struct_v = 1; + msg_a->global_id = cpu_to_le64(ac->global_id); + msg_a->service_id = cpu_to_le32(th->service); + msg_a->ticket_blob.struct_v = 1; + msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id); + msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len); + if (ticket_blob_len) { + memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base, + th->ticket_blob->vec.iov_len); + } + dout(" th %p secret_id %lld %lld\n", th, th->secret_id, + le64_to_cpu(msg_a->ticket_blob.secret_id)); + + p = msg_a + 1; + p += ticket_blob_len; + end = au->buf->vec.iov_base + au->buf->vec.iov_len; + + get_random_bytes(&au->nonce, sizeof(au->nonce)); + msg_b.struct_v = 1; + msg_b.nonce = cpu_to_le64(au->nonce); + ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b), + p, end - p); + if (ret < 0) + goto out_buf; + p += ret; + au->buf->vec.iov_len = p - au->buf->vec.iov_base; + dout(" built authorizer nonce %llx len %d\n", au->nonce, + (int)au->buf->vec.iov_len); + return 0; + +out_buf: + ceph_buffer_put(au->buf); + au->buf = NULL; + return ret; +} + +static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th, + void **p, void *end) +{ + ceph_decode_need(p, end, 1 + sizeof(u64), bad); + ceph_encode_8(p, 1); + ceph_encode_64(p, th->secret_id); + if (th->ticket_blob) { + const char *buf = th->ticket_blob->vec.iov_base; + u32 len = th->ticket_blob->vec.iov_len; + + ceph_encode_32_safe(p, end, len, bad); + ceph_encode_copy_safe(p, end, buf, len, bad); + } else { + ceph_encode_32_safe(p, end, 0, bad); + } + + return 0; +bad: + return -ERANGE; +} + +static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed) +{ + int want = ac->want_keys; + struct ceph_x_info *xi = ac->private; + int service; + + *pneed = ac->want_keys & ~(xi->have_keys); + + for (service = 1; service <= want; service <<= 1) { + struct ceph_x_ticket_handler *th; + + if (!(ac->want_keys & service)) + continue; + + if (*pneed & service) + continue; + + th = get_ticket_handler(ac, service); + + if (!th) { + *pneed |= service; + continue; + } + + if (get_seconds() >= th->renew_after) + *pneed |= service; + if (get_seconds() >= th->expires) + xi->have_keys &= ~service; + } +} + + +static int ceph_x_build_request(struct ceph_auth_client *ac, + void *buf, void *end) +{ + struct ceph_x_info *xi = ac->private; + int need; + struct ceph_x_request_header *head = buf; + int ret; + struct ceph_x_ticket_handler *th = + get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); + + ceph_x_validate_tickets(ac, &need); + + dout("build_request want %x have %x need %x\n", + ac->want_keys, xi->have_keys, need); + + if (need & CEPH_ENTITY_TYPE_AUTH) { + struct ceph_x_authenticate *auth = (void *)(head + 1); + void *p = auth + 1; + struct ceph_x_challenge_blob tmp; + char tmp_enc[40]; + u64 *u; + + if (p > end) + return -ERANGE; + + dout(" get_auth_session_key\n"); + head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY); + + /* encrypt and hash */ + get_random_bytes(&auth->client_challenge, sizeof(u64)); + tmp.client_challenge = auth->client_challenge; + tmp.server_challenge = cpu_to_le64(xi->server_challenge); + ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp), + tmp_enc, sizeof(tmp_enc)); + if (ret < 0) + return ret; + + auth->struct_v = 1; + auth->key = 0; + for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++) + auth->key ^= *u; + dout(" server_challenge %llx client_challenge %llx key %llx\n", + xi->server_challenge, le64_to_cpu(auth->client_challenge), + le64_to_cpu(auth->key)); + + /* now encode the old ticket if exists */ + ret = ceph_x_encode_ticket(th, &p, end); + if (ret < 0) + return ret; + + return p - buf; + } + + if (need) { + void *p = head + 1; + struct ceph_x_service_ticket_request *req; + + if (p > end) + return -ERANGE; + head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY); + + BUG_ON(!th); + ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer); + if (ret) + return ret; + ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base, + xi->auth_authorizer.buf->vec.iov_len); + + req = p; + req->keys = cpu_to_le32(need); + p += sizeof(*req); + return p - buf; + } + + return 0; +} + +static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, + void *buf, void *end) +{ + struct ceph_x_info *xi = ac->private; + struct ceph_x_reply_header *head = buf; + struct ceph_x_ticket_handler *th; + int len = end - buf; + int op; + int ret; + + if (result) + return result; /* XXX hmm? */ + + if (xi->starting) { + /* it's a hello */ + struct ceph_x_server_challenge *sc = buf; + + if (len != sizeof(*sc)) + return -EINVAL; + xi->server_challenge = le64_to_cpu(sc->server_challenge); + dout("handle_reply got server challenge %llx\n", + xi->server_challenge); + xi->starting = false; + xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH; + return -EAGAIN; + } + + op = le32_to_cpu(head->op); + result = le32_to_cpu(head->result); + dout("handle_reply op %d result %d\n", op, result); + switch (op) { + case CEPHX_GET_AUTH_SESSION_KEY: + /* verify auth key */ + ret = ceph_x_proc_ticket_reply(ac, &xi->secret, + buf + sizeof(*head), end); + break; + + case CEPHX_GET_PRINCIPAL_SESSION_KEY: + th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); + BUG_ON(!th); + ret = ceph_x_proc_ticket_reply(ac, &th->session_key, + buf + sizeof(*head), end); + break; + + default: + return -EINVAL; + } + if (ret) + return ret; + if (ac->want_keys == xi->have_keys) + return 0; + return -EAGAIN; +} + +static int ceph_x_create_authorizer( + struct ceph_auth_client *ac, int peer_type, + struct ceph_authorizer **a, + void **buf, size_t *len, + void **reply_buf, size_t *reply_len) +{ + struct ceph_x_authorizer *au; + struct ceph_x_ticket_handler *th; + int ret; + + th = get_ticket_handler(ac, peer_type); + if (IS_ERR(th)) + return PTR_ERR(th); + + au = kzalloc(sizeof(*au), GFP_NOFS); + if (!au) + return -ENOMEM; + + ret = ceph_x_build_authorizer(ac, th, au); + if (ret) { + kfree(au); + return ret; + } + + *a = (struct ceph_authorizer *)au; + *buf = au->buf->vec.iov_base; + *len = au->buf->vec.iov_len; + *reply_buf = au->reply_buf; + *reply_len = sizeof(au->reply_buf); + return 0; +} + +static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, + struct ceph_authorizer *a, size_t len) +{ + struct ceph_x_authorizer *au = (void *)a; + struct ceph_x_ticket_handler *th; + int ret = 0; + struct ceph_x_authorize_reply reply; + void *p = au->reply_buf; + void *end = p + sizeof(au->reply_buf); + + th = get_ticket_handler(ac, au->service); + if (!th) + return -EIO; /* hrm! */ + ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply)); + if (ret < 0) + return ret; + if (ret != sizeof(reply)) + return -EPERM; + + if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one)) + ret = -EPERM; + else + ret = 0; + dout("verify_authorizer_reply nonce %llx got %llx ret %d\n", + au->nonce, le64_to_cpu(reply.nonce_plus_one), ret); + return ret; +} + +static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac, + struct ceph_authorizer *a) +{ + struct ceph_x_authorizer *au = (void *)a; + + ceph_buffer_put(au->buf); + kfree(au); +} + + +static void ceph_x_reset(struct ceph_auth_client *ac) +{ + struct ceph_x_info *xi = ac->private; + + dout("reset\n"); + xi->starting = true; + xi->server_challenge = 0; +} + +static void ceph_x_destroy(struct ceph_auth_client *ac) +{ + struct ceph_x_info *xi = ac->private; + struct rb_node *p; + + dout("ceph_x_destroy %p\n", ac); + ceph_crypto_key_destroy(&xi->secret); + + while ((p = rb_first(&xi->ticket_handlers)) != NULL) { + struct ceph_x_ticket_handler *th = + rb_entry(p, struct ceph_x_ticket_handler, node); + remove_ticket_handler(ac, th); + } + + kmem_cache_destroy(ceph_x_ticketbuf_cachep); + + kfree(ac->private); + ac->private = NULL; +} + +static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, + int peer_type) +{ + struct ceph_x_ticket_handler *th; + + th = get_ticket_handler(ac, peer_type); + if (th && !IS_ERR(th)) + remove_ticket_handler(ac, th); +} + + +static const struct ceph_auth_client_ops ceph_x_ops = { + .is_authenticated = ceph_x_is_authenticated, + .build_request = ceph_x_build_request, + .handle_reply = ceph_x_handle_reply, + .create_authorizer = ceph_x_create_authorizer, + .verify_authorizer_reply = ceph_x_verify_authorizer_reply, + .destroy_authorizer = ceph_x_destroy_authorizer, + .invalidate_authorizer = ceph_x_invalidate_authorizer, + .reset = ceph_x_reset, + .destroy = ceph_x_destroy, +}; + + +int ceph_x_init(struct ceph_auth_client *ac) +{ + struct ceph_x_info *xi; + int ret; + + dout("ceph_x_init %p\n", ac); + xi = kzalloc(sizeof(*xi), GFP_NOFS); + if (!xi) + return -ENOMEM; + + ret = -ENOMEM; + ceph_x_ticketbuf_cachep = kmem_cache_create("ceph_x_ticketbuf", + TEMP_TICKET_BUF_LEN, 8, + (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), + NULL); + if (!ceph_x_ticketbuf_cachep) + goto done_nomem; + ret = -EINVAL; + if (!ac->secret) { + pr_err("no secret set (for auth_x protocol)\n"); + goto done_nomem; + } + + ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret); + if (ret) + goto done_nomem; + + xi->starting = true; + xi->ticket_handlers = RB_ROOT; + + ac->protocol = CEPH_AUTH_CEPHX; + ac->private = xi; + ac->ops = &ceph_x_ops; + return 0; + +done_nomem: + kfree(xi); + if (ceph_x_ticketbuf_cachep) + kmem_cache_destroy(ceph_x_ticketbuf_cachep); + return ret; +} + + diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h new file mode 100644 index 00000000000..ff6f8180e68 --- /dev/null +++ b/fs/ceph/auth_x.h @@ -0,0 +1,49 @@ +#ifndef _FS_CEPH_AUTH_X_H +#define _FS_CEPH_AUTH_X_H + +#include <linux/rbtree.h> + +#include "crypto.h" +#include "auth.h" +#include "auth_x_protocol.h" + +/* + * Handle ticket for a single service. + */ +struct ceph_x_ticket_handler { + struct rb_node node; + unsigned service; + + struct ceph_crypto_key session_key; + struct ceph_timespec validity; + + u64 secret_id; + struct ceph_buffer *ticket_blob; + + unsigned long renew_after, expires; +}; + + +struct ceph_x_authorizer { + struct ceph_buffer *buf; + unsigned service; + u64 nonce; + char reply_buf[128]; /* big enough for encrypted blob */ +}; + +struct ceph_x_info { + struct ceph_crypto_key secret; + + bool starting; + u64 server_challenge; + + unsigned have_keys; + struct rb_root ticket_handlers; + + struct ceph_x_authorizer auth_authorizer; +}; + +extern int ceph_x_init(struct ceph_auth_client *ac); + +#endif + diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h new file mode 100644 index 00000000000..671d30576c4 --- /dev/null +++ b/fs/ceph/auth_x_protocol.h @@ -0,0 +1,90 @@ +#ifndef __FS_CEPH_AUTH_X_PROTOCOL +#define __FS_CEPH_AUTH_X_PROTOCOL + +#define CEPHX_GET_AUTH_SESSION_KEY 0x0100 +#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200 +#define CEPHX_GET_ROTATING_KEY 0x0400 + +/* common bits */ +struct ceph_x_ticket_blob { + __u8 struct_v; + __le64 secret_id; + __le32 blob_len; + char blob[]; +} __attribute__ ((packed)); + + +/* common request/reply headers */ +struct ceph_x_request_header { + __le16 op; +} __attribute__ ((packed)); + +struct ceph_x_reply_header { + __le16 op; + __le32 result; +} __attribute__ ((packed)); + + +/* authenticate handshake */ + +/* initial hello (no reply header) */ +struct ceph_x_server_challenge { + __u8 struct_v; + __le64 server_challenge; +} __attribute__ ((packed)); + +struct ceph_x_authenticate { + __u8 struct_v; + __le64 client_challenge; + __le64 key; + /* ticket blob */ +} __attribute__ ((packed)); + +struct ceph_x_service_ticket_request { + __u8 struct_v; + __le32 keys; +} __attribute__ ((packed)); + +struct ceph_x_challenge_blob { + __le64 server_challenge; + __le64 client_challenge; +} __attribute__ ((packed)); + + + +/* authorize handshake */ + +/* + * The authorizer consists of two pieces: + * a - service id, ticket blob + * b - encrypted with session key + */ +struct ceph_x_authorize_a { + __u8 struct_v; + __le64 global_id; + __le32 service_id; + struct ceph_x_ticket_blob ticket_blob; +} __attribute__ ((packed)); + +struct ceph_x_authorize_b { + __u8 struct_v; + __le64 nonce; +} __attribute__ ((packed)); + +struct ceph_x_authorize_reply { + __u8 struct_v; + __le64 nonce_plus_one; +} __attribute__ ((packed)); + + +/* + * encyption bundle + */ +#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull + +struct ceph_x_encrypt_header { + __u8 struct_v; + __le64 magic; +} __attribute__ ((packed)); + +#endif diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c new file mode 100644 index 00000000000..b98086c7aeb --- /dev/null +++ b/fs/ceph/buffer.c @@ -0,0 +1,78 @@ + +#include "ceph_debug.h" +#include "buffer.h" +#include "decode.h" + +struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) +{ + struct ceph_buffer *b; + + b = kmalloc(sizeof(*b), gfp); + if (!b) + return NULL; + + b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN); + if (b->vec.iov_base) { + b->is_vmalloc = false; + } else { + b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL); + if (!b->vec.iov_base) { + kfree(b); + return NULL; + } + b->is_vmalloc = true; + } + + kref_init(&b->kref); + b->alloc_len = len; + b->vec.iov_len = len; + dout("buffer_new %p\n", b); + return b; +} + +void ceph_buffer_release(struct kref *kref) +{ + struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref); + + dout("buffer_release %p\n", b); + if (b->vec.iov_base) { + if (b->is_vmalloc) + vfree(b->vec.iov_base); + else + kfree(b->vec.iov_base); + } + kfree(b); +} + +int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp) +{ + b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN); + if (b->vec.iov_base) { + b->is_vmalloc = false; + } else { + b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL); + b->is_vmalloc = true; + } + if (!b->vec.iov_base) + return -ENOMEM; + b->alloc_len = len; + b->vec.iov_len = len; + return 0; +} + +int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end) +{ + size_t len; + + ceph_decode_need(p, end, sizeof(u32), bad); + len = ceph_decode_32(p); + dout("decode_buffer len %d\n", (int)len); + ceph_decode_need(p, end, len, bad); + *b = ceph_buffer_new(len, GFP_NOFS); + if (!*b) + return -ENOMEM; + ceph_decode_copy(p, (*b)->vec.iov_base, len); + return 0; +bad: + return -EINVAL; +} diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h new file mode 100644 index 00000000000..58d19014068 --- /dev/null +++ b/fs/ceph/buffer.h @@ -0,0 +1,39 @@ +#ifndef __FS_CEPH_BUFFER_H +#define __FS_CEPH_BUFFER_H + +#include <linux/kref.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <linux/types.h> +#include <linux/uio.h> + +/* + * a simple reference counted buffer. + * + * use kmalloc for small sizes (<= one page), vmalloc for larger + * sizes. + */ +struct ceph_buffer { + struct kref kref; + struct kvec vec; + size_t alloc_len; + bool is_vmalloc; +}; + +extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp); +extern void ceph_buffer_release(struct kref *kref); + +static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b) +{ + kref_get(&b->kref); + return b; +} + +static inline void ceph_buffer_put(struct ceph_buffer *b) +{ + kref_put(&b->kref, ceph_buffer_release); +} + +extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end); + +#endif diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c new file mode 100644 index 00000000000..db122bb357b --- /dev/null +++ b/fs/ceph/caps.c @@ -0,0 +1,2927 @@ +#include "ceph_debug.h" + +#include <linux/fs.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/vmalloc.h> +#include <linux/wait.h> +#include <linux/writeback.h> + +#include "super.h" +#include "decode.h" +#include "messenger.h" + +/* + * Capability management + * + * The Ceph metadata servers control client access to inode metadata + * and file data by issuing capabilities, granting clients permission + * to read and/or write both inode field and file data to OSDs + * (storage nodes). Each capability consists of a set of bits + * indicating which operations are allowed. + * + * If the client holds a *_SHARED cap, the client has a coherent value + * that can be safely read from the cached inode. + * + * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the + * client is allowed to change inode attributes (e.g., file size, + * mtime), note its dirty state in the ceph_cap, and asynchronously + * flush that metadata change to the MDS. + * + * In the event of a conflicting operation (perhaps by another + * client), the MDS will revoke the conflicting client capabilities. + * + * In order for a client to cache an inode, it must hold a capability + * with at least one MDS server. When inodes are released, release + * notifications are batched and periodically sent en masse to the MDS + * cluster to release server state. + */ + + +/* + * Generate readable cap strings for debugging output. + */ +#define MAX_CAP_STR 20 +static char cap_str[MAX_CAP_STR][40]; +static DEFINE_SPINLOCK(cap_str_lock); +static int last_cap_str; + +static char *gcap_string(char *s, int c) +{ + if (c & CEPH_CAP_GSHARED) + *s++ = 's'; + if (c & CEPH_CAP_GEXCL) + *s++ = 'x'; + if (c & CEPH_CAP_GCACHE) + *s++ = 'c'; + if (c & CEPH_CAP_GRD) + *s++ = 'r'; + if (c & CEPH_CAP_GWR) + *s++ = 'w'; + if (c & CEPH_CAP_GBUFFER) + *s++ = 'b'; + if (c & CEPH_CAP_GLAZYIO) + *s++ = 'l'; + return s; +} + +const char *ceph_cap_string(int caps) +{ + int i; + char *s; + int c; + + spin_lock(&cap_str_lock); + i = last_cap_str++; + if (last_cap_str == MAX_CAP_STR) + last_cap_str = 0; + spin_unlock(&cap_str_lock); + + s = cap_str[i]; + + if (caps & CEPH_CAP_PIN) + *s++ = 'p'; + + c = (caps >> CEPH_CAP_SAUTH) & 3; + if (c) { + *s++ = 'A'; + s = gcap_string(s, c); + } + + c = (caps >> CEPH_CAP_SLINK) & 3; + if (c) { + *s++ = 'L'; + s = gcap_string(s, c); + } + + c = (caps >> CEPH_CAP_SXATTR) & 3; + if (c) { + *s++ = 'X'; + s = gcap_string(s, c); + } + + c = caps >> CEPH_CAP_SFILE; + if (c) { + *s++ = 'F'; + s = gcap_string(s, c); + } + + if (s == cap_str[i]) + *s++ = '-'; + *s = 0; + return cap_str[i]; +} + +/* + * Cap reservations + * + * Maintain a global pool of preallocated struct ceph_caps, referenced + * by struct ceph_caps_reservations. This ensures that we preallocate + * memory needed to successfully process an MDS response. (If an MDS + * sends us cap information and we fail to process it, we will have + * problems due to the client and MDS being out of sync.) + * + * Reservations are 'owned' by a ceph_cap_reservation context. + */ +static spinlock_t caps_list_lock; +static struct list_head caps_list; /* unused (reserved or unreserved) */ +static int caps_total_count; /* total caps allocated */ +static int caps_use_count; /* in use */ +static int caps_reserve_count; /* unused, reserved */ +static int caps_avail_count; /* unused, unreserved */ +static int caps_min_count; /* keep at least this many (unreserved) */ + +void __init ceph_caps_init(void) +{ + INIT_LIST_HEAD(&caps_list); + spin_lock_init(&caps_list_lock); +} + +void ceph_caps_finalize(void) +{ + struct ceph_cap *cap; + + spin_lock(&caps_list_lock); + while (!list_empty(&caps_list)) { + cap = list_first_entry(&caps_list, struct ceph_cap, caps_item); + list_del(&cap->caps_item); + kmem_cache_free(ceph_cap_cachep, cap); + } + caps_total_count = 0; + caps_avail_count = 0; + caps_use_count = 0; + caps_reserve_count = 0; + caps_min_count = 0; + spin_unlock(&caps_list_lock); +} + +void ceph_adjust_min_caps(int delta) +{ + spin_lock(&caps_list_lock); + caps_min_count += delta; + BUG_ON(caps_min_count < 0); + spin_unlock(&caps_list_lock); +} + +int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need) +{ + int i; + struct ceph_cap *cap; + int have; + int alloc = 0; + LIST_HEAD(newcaps); + int ret = 0; + + dout("reserve caps ctx=%p need=%d\n", ctx, need); + + /* first reserve any caps that are already allocated */ + spin_lock(&caps_list_lock); + if (caps_avail_count >= need) + have = need; + else + have = caps_avail_count; + caps_avail_count -= have; + caps_reserve_count += have; + BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + + caps_avail_count); + spin_unlock(&caps_list_lock); + + for (i = have; i < need; i++) { + cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); + if (!cap) { + ret = -ENOMEM; + goto out_alloc_count; + } + list_add(&cap->caps_item, &newcaps); + alloc++; + } + BUG_ON(have + alloc != need); + + spin_lock(&caps_list_lock); + caps_total_count += alloc; + caps_reserve_count += alloc; + list_splice(&newcaps, &caps_list); + + BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + + caps_avail_count); + spin_unlock(&caps_list_lock); + + ctx->count = need; + dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", + ctx, caps_total_count, caps_use_count, caps_reserve_count, + caps_avail_count); + return 0; + +out_alloc_count: + /* we didn't manage to reserve as much as we needed */ + pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n", + ctx, need, have); + return ret; +} + +int ceph_unreserve_caps(struct ceph_cap_reservation *ctx) +{ + dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); + if (ctx->count) { + spin_lock(&caps_list_lock); + BUG_ON(caps_reserve_count < ctx->count); + caps_reserve_count -= ctx->count; + caps_avail_count += ctx->count; + ctx->count = 0; + dout("unreserve caps %d = %d used + %d resv + %d avail\n", + caps_total_count, caps_use_count, caps_reserve_count, + caps_avail_count); + BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + + caps_avail_count); + spin_unlock(&caps_list_lock); + } + return 0; +} + +static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx) +{ + struct ceph_cap *cap = NULL; + + /* temporary, until we do something about cap import/export */ + if (!ctx) + return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); + + spin_lock(&caps_list_lock); + dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", + ctx, ctx->count, caps_total_count, caps_use_count, + caps_reserve_count, caps_avail_count); + BUG_ON(!ctx->count); + BUG_ON(ctx->count > caps_reserve_count); + BUG_ON(list_empty(&caps_list)); + + ctx->count--; + caps_reserve_count--; + caps_use_count++; + + cap = list_first_entry(&caps_list, struct ceph_cap, caps_item); + list_del(&cap->caps_item); + + BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + + caps_avail_count); + spin_unlock(&caps_list_lock); + return cap; +} + +void ceph_put_cap(struct ceph_cap *cap) +{ + spin_lock(&caps_list_lock); + dout("put_cap %p %d = %d used + %d resv + %d avail\n", + cap, caps_total_count, caps_use_count, + caps_reserve_count, caps_avail_count); + caps_use_count--; + /* + * Keep some preallocated caps around (ceph_min_count), to + * avoid lots of free/alloc churn. + */ + if (caps_avail_count >= caps_reserve_count + caps_min_count) { + caps_total_count--; + kmem_cache_free(ceph_cap_cachep, cap); + } else { + caps_avail_count++; + list_add(&cap->caps_item, &caps_list); + } + + BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + + caps_avail_count); + spin_unlock(&caps_list_lock); +} + +void ceph_reservation_status(struct ceph_client *client, + int *total, int *avail, int *used, int *reserved, + int *min) +{ + if (total) + *total = caps_total_count; + if (avail) + *avail = caps_avail_count; + if (used) + *used = caps_use_count; + if (reserved) + *reserved = caps_reserve_count; + if (min) + *min = caps_min_count; +} + +/* + * Find ceph_cap for given mds, if any. + * + * Called with i_lock held. + */ +static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) +{ + struct ceph_cap *cap; + struct rb_node *n = ci->i_caps.rb_node; + + while (n) { + cap = rb_entry(n, struct ceph_cap, ci_node); + if (mds < cap->mds) + n = n->rb_left; + else if (mds > cap->mds) + n = n->rb_right; + else + return cap; + } + return NULL; +} + +/* + * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else + * -1. + */ +static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq) +{ + struct ceph_cap *cap; + int mds = -1; + struct rb_node *p; + + /* prefer mds with WR|WRBUFFER|EXCL caps */ + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + mds = cap->mds; + if (mseq) + *mseq = cap->mseq; + if (cap->issued & (CEPH_CAP_FILE_WR | + CEPH_CAP_FILE_BUFFER | + CEPH_CAP_FILE_EXCL)) + break; + } + return mds; +} + +int ceph_get_cap_mds(struct inode *inode) +{ + int mds; + spin_lock(&inode->i_lock); + mds = __ceph_get_cap_mds(ceph_inode(inode), NULL); + spin_unlock(&inode->i_lock); + return mds; +} + +/* + * Called under i_lock. + */ +static void __insert_cap_node(struct ceph_inode_info *ci, + struct ceph_cap *new) +{ + struct rb_node **p = &ci->i_caps.rb_node; + struct rb_node *parent = NULL; + struct ceph_cap *cap = NULL; + + while (*p) { + parent = *p; + cap = rb_entry(parent, struct ceph_cap, ci_node); + if (new->mds < cap->mds) + p = &(*p)->rb_left; + else if (new->mds > cap->mds) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->ci_node, parent, p); + rb_insert_color(&new->ci_node, &ci->i_caps); +} + +/* + * (re)set cap hold timeouts, which control the delayed release + * of unused caps back to the MDS. Should be called on cap use. + */ +static void __cap_set_timeouts(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci) +{ + struct ceph_mount_args *ma = mdsc->client->mount_args; + + ci->i_hold_caps_min = round_jiffies(jiffies + + ma->caps_wanted_delay_min * HZ); + ci->i_hold_caps_max = round_jiffies(jiffies + + ma->caps_wanted_delay_max * HZ); + dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode, + ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies); +} + +/* + * (Re)queue cap at the end of the delayed cap release list. + * + * If I_FLUSH is set, leave the inode at the front of the list. + * + * Caller holds i_lock + * -> we take mdsc->cap_delay_lock + */ +static void __cap_delay_requeue(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci) +{ + __cap_set_timeouts(mdsc, ci); + dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode, + ci->i_ceph_flags, ci->i_hold_caps_max); + if (!mdsc->stopping) { + spin_lock(&mdsc->cap_delay_lock); + if (!list_empty(&ci->i_cap_delay_list)) { + if (ci->i_ceph_flags & CEPH_I_FLUSH) + goto no_change; + list_del_init(&ci->i_cap_delay_list); + } + list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); +no_change: + spin_unlock(&mdsc->cap_delay_lock); + } +} + +/* + * Queue an inode for immediate writeback. Mark inode with I_FLUSH, + * indicating we should send a cap message to flush dirty metadata + * asap, and move to the front of the delayed cap list. + */ +static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci) +{ + dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode); + spin_lock(&mdsc->cap_delay_lock); + ci->i_ceph_flags |= CEPH_I_FLUSH; + if (!list_empty(&ci->i_cap_delay_list)) + list_del_init(&ci->i_cap_delay_list); + list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list); + spin_unlock(&mdsc->cap_delay_lock); +} + +/* + * Cancel delayed work on cap. + * + * Caller must hold i_lock. + */ +static void __cap_delay_cancel(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci) +{ + dout("__cap_delay_cancel %p\n", &ci->vfs_inode); + if (list_empty(&ci->i_cap_delay_list)) + return; + spin_lock(&mdsc->cap_delay_lock); + list_del_init(&ci->i_cap_delay_list); + spin_unlock(&mdsc->cap_delay_lock); +} + +/* + * Common issue checks for add_cap, handle_cap_grant. + */ +static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, + unsigned issued) +{ + unsigned had = __ceph_caps_issued(ci, NULL); + + /* + * Each time we receive FILE_CACHE anew, we increment + * i_rdcache_gen. + */ + if ((issued & CEPH_CAP_FILE_CACHE) && + (had & CEPH_CAP_FILE_CACHE) == 0) + ci->i_rdcache_gen++; + + /* + * if we are newly issued FILE_SHARED, clear I_COMPLETE; we + * don't know what happened to this directory while we didn't + * have the cap. + */ + if ((issued & CEPH_CAP_FILE_SHARED) && + (had & CEPH_CAP_FILE_SHARED) == 0) { + ci->i_shared_gen++; + if (S_ISDIR(ci->vfs_inode.i_mode)) { + dout(" marking %p NOT complete\n", &ci->vfs_inode); + ci->i_ceph_flags &= ~CEPH_I_COMPLETE; + } + } +} + +/* + * Add a capability under the given MDS session. + * + * Caller should hold session snap_rwsem (read) and s_mutex. + * + * @fmode is the open file mode, if we are opening a file, otherwise + * it is < 0. (This is so we can atomically add the cap and add an + * open file reference to it.) + */ +int ceph_add_cap(struct inode *inode, + struct ceph_mds_session *session, u64 cap_id, + int fmode, unsigned issued, unsigned wanted, + unsigned seq, unsigned mseq, u64 realmino, int flags, + struct ceph_cap_reservation *caps_reservation) +{ + struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *new_cap = NULL; + struct ceph_cap *cap; + int mds = session->s_mds; + int actual_wanted; + + dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, + session->s_mds, cap_id, ceph_cap_string(issued), seq); + + /* + * If we are opening the file, include file mode wanted bits + * in wanted. + */ + if (fmode >= 0) + wanted |= ceph_caps_for_mode(fmode); + +retry: + spin_lock(&inode->i_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap) { + if (new_cap) { + cap = new_cap; + new_cap = NULL; + } else { + spin_unlock(&inode->i_lock); + new_cap = get_cap(caps_reservation); + if (new_cap == NULL) + return -ENOMEM; + goto retry; + } + + cap->issued = 0; + cap->implemented = 0; + cap->mds = mds; + cap->mds_wanted = 0; + + cap->ci = ci; + __insert_cap_node(ci, cap); + + /* clear out old exporting info? (i.e. on cap import) */ + if (ci->i_cap_exporting_mds == mds) { + ci->i_cap_exporting_issued = 0; + ci->i_cap_exporting_mseq = 0; + ci->i_cap_exporting_mds = -1; + } + + /* add to session cap list */ + cap->session = session; + spin_lock(&session->s_cap_lock); + list_add_tail(&cap->session_caps, &session->s_caps); + session->s_nr_caps++; + spin_unlock(&session->s_cap_lock); + } + + if (!ci->i_snap_realm) { + /* + * add this inode to the appropriate snap realm + */ + struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, + realmino); + if (realm) { + ceph_get_snap_realm(mdsc, realm); + spin_lock(&realm->inodes_with_caps_lock); + ci->i_snap_realm = realm; + list_add(&ci->i_snap_realm_item, + &realm->inodes_with_caps); + spin_unlock(&realm->inodes_with_caps_lock); + } else { + pr_err("ceph_add_cap: couldn't find snap realm %llx\n", + realmino); + } + } + + __check_cap_issue(ci, cap, issued); + + /* + * If we are issued caps we don't want, or the mds' wanted + * value appears to be off, queue a check so we'll release + * later and/or update the mds wanted value. + */ + actual_wanted = __ceph_caps_wanted(ci); + if ((wanted & ~actual_wanted) || + (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) { + dout(" issued %s, mds wanted %s, actual %s, queueing\n", + ceph_cap_string(issued), ceph_cap_string(wanted), + ceph_cap_string(actual_wanted)); + __cap_delay_requeue(mdsc, ci); + } + + if (flags & CEPH_CAP_FLAG_AUTH) + ci->i_auth_cap = cap; + else if (ci->i_auth_cap == cap) + ci->i_auth_cap = NULL; + + dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", + inode, ceph_vinop(inode), cap, ceph_cap_string(issued), + ceph_cap_string(issued|cap->issued), seq, mds); + cap->cap_id = cap_id; + cap->issued = issued; + cap->implemented |= issued; + cap->mds_wanted |= wanted; + cap->seq = seq; + cap->issue_seq = seq; + cap->mseq = mseq; + cap->cap_gen = session->s_cap_gen; + + if (fmode >= 0) + __ceph_get_fmode(ci, fmode); + spin_unlock(&inode->i_lock); + wake_up(&ci->i_cap_wq); + return 0; +} + +/* + * Return true if cap has not timed out and belongs to the current + * generation of the MDS session (i.e. has not gone 'stale' due to + * us losing touch with the mds). + */ +static int __cap_is_valid(struct ceph_cap *cap) +{ + unsigned long ttl; + u32 gen; + + spin_lock(&cap->session->s_cap_lock); + gen = cap->session->s_cap_gen; + ttl = cap->session->s_cap_ttl; + spin_unlock(&cap->session->s_cap_lock); + + if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { + dout("__cap_is_valid %p cap %p issued %s " + "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode, + cap, ceph_cap_string(cap->issued), cap->cap_gen, gen); + return 0; + } + + return 1; +} + +/* + * Return set of valid cap bits issued to us. Note that caps time + * out, and may be invalidated in bulk if the client session times out + * and session->s_cap_gen is bumped. + */ +int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) +{ + int have = ci->i_snap_caps | ci->i_cap_exporting_issued; + struct ceph_cap *cap; + struct rb_node *p; + + if (implemented) + *implemented = 0; + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (!__cap_is_valid(cap)) + continue; + dout("__ceph_caps_issued %p cap %p issued %s\n", + &ci->vfs_inode, cap, ceph_cap_string(cap->issued)); + have |= cap->issued; + if (implemented) + *implemented |= cap->implemented; + } + return have; +} + +/* + * Get cap bits issued by caps other than @ocap + */ +int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap) +{ + int have = ci->i_snap_caps; + struct ceph_cap *cap; + struct rb_node *p; + + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (cap == ocap) + continue; + if (!__cap_is_valid(cap)) + continue; + have |= cap->issued; + } + return have; +} + +/* + * Move a cap to the end of the LRU (oldest caps at list head, newest + * at list tail). + */ +static void __touch_cap(struct ceph_cap *cap) +{ + struct ceph_mds_session *s = cap->session; + + spin_lock(&s->s_cap_lock); + if (s->s_cap_iterator == NULL) { + dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap, + s->s_mds); + list_move_tail(&cap->session_caps, &s->s_caps); + } else { + dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n", + &cap->ci->vfs_inode, cap, s->s_mds); + } + spin_unlock(&s->s_cap_lock); +} + +/* + * Check if we hold the given mask. If so, move the cap(s) to the + * front of their respective LRUs. (This is the preferred way for + * callers to check for caps they want.) + */ +int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) +{ + struct ceph_cap *cap; + struct rb_node *p; + int have = ci->i_snap_caps; + + if ((have & mask) == mask) { + dout("__ceph_caps_issued_mask %p snap issued %s" + " (mask %s)\n", &ci->vfs_inode, + ceph_cap_string(have), + ceph_cap_string(mask)); + return 1; + } + + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (!__cap_is_valid(cap)) + continue; + if ((cap->issued & mask) == mask) { + dout("__ceph_caps_issued_mask %p cap %p issued %s" + " (mask %s)\n", &ci->vfs_inode, cap, + ceph_cap_string(cap->issued), + ceph_cap_string(mask)); + if (touch) + __touch_cap(cap); + return 1; + } + + /* does a combination of caps satisfy mask? */ + have |= cap->issued; + if ((have & mask) == mask) { + dout("__ceph_caps_issued_mask %p combo issued %s" + " (mask %s)\n", &ci->vfs_inode, + ceph_cap_string(cap->issued), + ceph_cap_string(mask)); + if (touch) { + struct rb_node *q; + + /* touch this + preceeding caps */ + __touch_cap(cap); + for (q = rb_first(&ci->i_caps); q != p; + q = rb_next(q)) { + cap = rb_entry(q, struct ceph_cap, + ci_node); + if (!__cap_is_valid(cap)) + continue; + __touch_cap(cap); + } + } + return 1; + } + } + + return 0; +} + +/* + * Return true if mask caps are currently being revoked by an MDS. + */ +int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_cap *cap; + struct rb_node *p; + int ret = 0; + + spin_lock(&inode->i_lock); + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (__cap_is_valid(cap) && + (cap->implemented & ~cap->issued & mask)) { + ret = 1; + break; + } + } + spin_unlock(&inode->i_lock); + dout("ceph_caps_revoking %p %s = %d\n", inode, + ceph_cap_string(mask), ret); + return ret; +} + +int __ceph_caps_used(struct ceph_inode_info *ci) +{ + int used = 0; + if (ci->i_pin_ref) + used |= CEPH_CAP_PIN; + if (ci->i_rd_ref) + used |= CEPH_CAP_FILE_RD; + if (ci->i_rdcache_ref || ci->i_rdcache_gen) + used |= CEPH_CAP_FILE_CACHE; + if (ci->i_wr_ref) + used |= CEPH_CAP_FILE_WR; + if (ci->i_wrbuffer_ref) + used |= CEPH_CAP_FILE_BUFFER; + return used; +} + +/* + * wanted, by virtue of open file modes + */ +int __ceph_caps_file_wanted(struct ceph_inode_info *ci) +{ + int want = 0; + int mode; + for (mode = 0; mode < 4; mode++) + if (ci->i_nr_by_mode[mode]) + want |= ceph_caps_for_mode(mode); + return want; +} + +/* + * Return caps we have registered with the MDS(s) as 'wanted'. + */ +int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) +{ + struct ceph_cap *cap; + struct rb_node *p; + int mds_wanted = 0; + + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (!__cap_is_valid(cap)) + continue; + mds_wanted |= cap->mds_wanted; + } + return mds_wanted; +} + +/* + * called under i_lock + */ +static int __ceph_is_any_caps(struct ceph_inode_info *ci) +{ + return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0; +} + +/* + * caller should hold i_lock. + * caller will not hold session s_mutex if called from destroy_inode. + */ +void __ceph_remove_cap(struct ceph_cap *cap) +{ + struct ceph_mds_session *session = cap->session; + struct ceph_inode_info *ci = cap->ci; + struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; + + dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); + + /* remove from inode list */ + rb_erase(&cap->ci_node, &ci->i_caps); + cap->ci = NULL; + if (ci->i_auth_cap == cap) + ci->i_auth_cap = NULL; + + /* remove from session list */ + spin_lock(&session->s_cap_lock); + if (session->s_cap_iterator == cap) { + /* not yet, we are iterating over this very cap */ + dout("__ceph_remove_cap delaying %p removal from session %p\n", + cap, cap->session); + } else { + list_del_init(&cap->session_caps); + session->s_nr_caps--; + cap->session = NULL; + } + spin_unlock(&session->s_cap_lock); + + if (cap->session == NULL) + ceph_put_cap(cap); + + if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) { + struct ceph_snap_realm *realm = ci->i_snap_realm; + spin_lock(&realm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + ci->i_snap_realm_counter++; + ci->i_snap_realm = NULL; + spin_unlock(&realm->inodes_with_caps_lock); + ceph_put_snap_realm(mdsc, realm); + } + if (!__ceph_is_any_real_caps(ci)) + __cap_delay_cancel(mdsc, ci); +} + +/* + * Build and send a cap message to the given MDS. + * + * Caller should be holding s_mutex. + */ +static int send_cap_msg(struct ceph_mds_session *session, + u64 ino, u64 cid, int op, + int caps, int wanted, int dirty, + u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq, + u64 size, u64 max_size, + struct timespec *mtime, struct timespec *atime, + u64 time_warp_seq, + uid_t uid, gid_t gid, mode_t mode, + u64 xattr_version, + struct ceph_buffer *xattrs_buf, + u64 follows) +{ + struct ceph_mds_caps *fc; + struct ceph_msg *msg; + + dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" + " seq %u/%u mseq %u follows %lld size %llu/%llu" + " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op), + cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted), + ceph_cap_string(dirty), + seq, issue_seq, mseq, follows, size, max_size, + xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); + + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + msg->hdr.tid = cpu_to_le64(flush_tid); + + fc = msg->front.iov_base; + memset(fc, 0, sizeof(*fc)); + + fc->cap_id = cpu_to_le64(cid); + fc->op = cpu_to_le32(op); + fc->seq = cpu_to_le32(seq); + fc->issue_seq = cpu_to_le32(issue_seq); + fc->migrate_seq = cpu_to_le32(mseq); + fc->caps = cpu_to_le32(caps); + fc->wanted = cpu_to_le32(wanted); + fc->dirty = cpu_to_le32(dirty); + fc->ino = cpu_to_le64(ino); + fc->snap_follows = cpu_to_le64(follows); + + fc->size = cpu_to_le64(size); + fc->max_size = cpu_to_le64(max_size); + if (mtime) + ceph_encode_timespec(&fc->mtime, mtime); + if (atime) + ceph_encode_timespec(&fc->atime, atime); + fc->time_warp_seq = cpu_to_le32(time_warp_seq); + + fc->uid = cpu_to_le32(uid); + fc->gid = cpu_to_le32(gid); + fc->mode = cpu_to_le32(mode); + + fc->xattr_version = cpu_to_le64(xattr_version); + if (xattrs_buf) { + msg->middle = ceph_buffer_get(xattrs_buf); + fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len); + msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len); + } + + ceph_con_send(&session->s_con, msg); + return 0; +} + +/* + * Queue cap releases when an inode is dropped from our cache. Since + * inode is about to be destroyed, there is no need for i_lock. + */ +void ceph_queue_caps_release(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct rb_node *p; + + p = rb_first(&ci->i_caps); + while (p) { + struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); + struct ceph_mds_session *session = cap->session; + struct ceph_msg *msg; + struct ceph_mds_cap_release *head; + struct ceph_mds_cap_item *item; + + spin_lock(&session->s_cap_lock); + BUG_ON(!session->s_num_cap_releases); + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, list_head); + + dout(" adding %p release to mds%d msg %p (%d left)\n", + inode, session->s_mds, msg, session->s_num_cap_releases); + + BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); + head = msg->front.iov_base; + head->num = cpu_to_le32(le32_to_cpu(head->num) + 1); + item = msg->front.iov_base + msg->front.iov_len; + item->ino = cpu_to_le64(ceph_ino(inode)); + item->cap_id = cpu_to_le64(cap->cap_id); + item->migrate_seq = cpu_to_le32(cap->mseq); + item->seq = cpu_to_le32(cap->issue_seq); + + session->s_num_cap_releases--; + + msg->front.iov_len += sizeof(*item); + if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { + dout(" release msg %p full\n", msg); + list_move_tail(&msg->list_head, + &session->s_cap_releases_done); + } else { + dout(" release msg %p at %d/%d (%d)\n", msg, + (int)le32_to_cpu(head->num), + (int)CEPH_CAPS_PER_RELEASE, + (int)msg->front.iov_len); + } + spin_unlock(&session->s_cap_lock); + p = rb_next(p); + __ceph_remove_cap(cap); + } +} + +/* + * Send a cap msg on the given inode. Update our caps state, then + * drop i_lock and send the message. + * + * Make note of max_size reported/requested from mds, revoked caps + * that have now been implemented. + * + * Make half-hearted attempt ot to invalidate page cache if we are + * dropping RDCACHE. Note that this will leave behind locked pages + * that we'll then need to deal with elsewhere. + * + * Return non-zero if delayed release, or we experienced an error + * such that the caller should requeue + retry later. + * + * called with i_lock, then drops it. + * caller should hold snap_rwsem (read), s_mutex. + */ +static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, + int op, int used, int want, int retain, int flushing, + unsigned *pflush_tid) + __releases(cap->ci->vfs_inode->i_lock) +{ + struct ceph_inode_info *ci = cap->ci; + struct inode *inode = &ci->vfs_inode; + u64 cap_id = cap->cap_id; + int held, revoking, dropping, keep; + u64 seq, issue_seq, mseq, time_warp_seq, follows; + u64 size, max_size; + struct timespec mtime, atime; + int wake = 0; + mode_t mode; + uid_t uid; + gid_t gid; + struct ceph_mds_session *session; + u64 xattr_version = 0; + int delayed = 0; + u64 flush_tid = 0; + int i; + int ret; + + held = cap->issued | cap->implemented; + revoking = cap->implemented & ~cap->issued; + retain &= ~revoking; + dropping = cap->issued & ~retain; + + dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n", + inode, cap, cap->session, + ceph_cap_string(held), ceph_cap_string(held & retain), + ceph_cap_string(revoking)); + BUG_ON((retain & CEPH_CAP_PIN) == 0); + + session = cap->session; + + /* don't release wanted unless we've waited a bit. */ + if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && + time_before(jiffies, ci->i_hold_caps_min)) { + dout(" delaying issued %s -> %s, wanted %s -> %s on send\n", + ceph_cap_string(cap->issued), + ceph_cap_string(cap->issued & retain), + ceph_cap_string(cap->mds_wanted), + ceph_cap_string(want)); + want |= cap->mds_wanted; + retain |= cap->issued; + delayed = 1; + } + ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH); + + cap->issued &= retain; /* drop bits we don't want */ + if (cap->implemented & ~cap->issued) { + /* + * Wake up any waiters on wanted -> needed transition. + * This is due to the weird transition from buffered + * to sync IO... we need to flush dirty pages _before_ + * allowing sync writes to avoid reordering. + */ + wake = 1; + } + cap->implemented &= cap->issued | used; + cap->mds_wanted = want; + + if (flushing) { + /* + * assign a tid for flush operations so we can avoid + * flush1 -> dirty1 -> flush2 -> flushack1 -> mark + * clean type races. track latest tid for every bit + * so we can handle flush AxFw, flush Fw, and have the + * first ack clean Ax. + */ + flush_tid = ++ci->i_cap_flush_last_tid; + if (pflush_tid) + *pflush_tid = flush_tid; + dout(" cap_flush_tid %d\n", (int)flush_tid); + for (i = 0; i < CEPH_CAP_BITS; i++) + if (flushing & (1 << i)) + ci->i_cap_flush_tid[i] = flush_tid; + } + + keep = cap->implemented; + seq = cap->seq; + issue_seq = cap->issue_seq; + mseq = cap->mseq; + size = inode->i_size; + ci->i_reported_size = size; + max_size = ci->i_wanted_max_size; + ci->i_requested_max_size = max_size; + mtime = inode->i_mtime; + atime = inode->i_atime; + time_warp_seq = ci->i_time_warp_seq; + follows = ci->i_snap_realm->cached_context->seq; + uid = inode->i_uid; + gid = inode->i_gid; + mode = inode->i_mode; + + if (dropping & CEPH_CAP_XATTR_EXCL) { + __ceph_build_xattrs_blob(ci); + xattr_version = ci->i_xattrs.version + 1; + } + + spin_unlock(&inode->i_lock); + + ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, + op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, + size, max_size, &mtime, &atime, time_warp_seq, + uid, gid, mode, + xattr_version, + (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL, + follows); + if (ret < 0) { + dout("error sending cap msg, must requeue %p\n", inode); + delayed = 1; + } + + if (wake) + wake_up(&ci->i_cap_wq); + + return delayed; +} + +/* + * When a snapshot is taken, clients accumulate dirty metadata on + * inodes with capabilities in ceph_cap_snaps to describe the file + * state at the time the snapshot was taken. This must be flushed + * asynchronously back to the MDS once sync writes complete and dirty + * data is written out. + * + * Called under i_lock. Takes s_mutex as needed. + */ +void __ceph_flush_snaps(struct ceph_inode_info *ci, + struct ceph_mds_session **psession) +{ + struct inode *inode = &ci->vfs_inode; + int mds; + struct ceph_cap_snap *capsnap; + u32 mseq; + struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_session *session = NULL; /* if session != NULL, we hold + session->s_mutex */ + u64 next_follows = 0; /* keep track of how far we've gotten through the + i_cap_snaps list, and skip these entries next time + around to avoid an infinite loop */ + + if (psession) + session = *psession; + + dout("__flush_snaps %p\n", inode); +retry: + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + /* avoid an infiniute loop after retry */ + if (capsnap->follows < next_follows) + continue; + /* + * we need to wait for sync writes to complete and for dirty + * pages to be written out. + */ + if (capsnap->dirty_pages || capsnap->writing) + continue; + + /* pick mds, take s_mutex */ + mds = __ceph_get_cap_mds(ci, &mseq); + if (session && session->s_mds != mds) { + dout("oops, wrong session %p mutex\n", session); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + session = NULL; + } + if (!session) { + spin_unlock(&inode->i_lock); + mutex_lock(&mdsc->mutex); + session = __ceph_lookup_mds_session(mdsc, mds); + mutex_unlock(&mdsc->mutex); + if (session) { + dout("inverting session/ino locks on %p\n", + session); + mutex_lock(&session->s_mutex); + } + /* + * if session == NULL, we raced against a cap + * deletion. retry, and we'll get a better + * @mds value next time. + */ + spin_lock(&inode->i_lock); + goto retry; + } + + capsnap->flush_tid = ++ci->i_cap_flush_last_tid; + atomic_inc(&capsnap->nref); + if (!list_empty(&capsnap->flushing_item)) + list_del_init(&capsnap->flushing_item); + list_add_tail(&capsnap->flushing_item, + &session->s_cap_snaps_flushing); + spin_unlock(&inode->i_lock); + + dout("flush_snaps %p cap_snap %p follows %lld size %llu\n", + inode, capsnap, next_follows, capsnap->size); + send_cap_msg(session, ceph_vino(inode).ino, 0, + CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, + capsnap->dirty, 0, capsnap->flush_tid, 0, mseq, + capsnap->size, 0, + &capsnap->mtime, &capsnap->atime, + capsnap->time_warp_seq, + capsnap->uid, capsnap->gid, capsnap->mode, + 0, NULL, + capsnap->follows); + + next_follows = capsnap->follows + 1; + ceph_put_cap_snap(capsnap); + + spin_lock(&inode->i_lock); + goto retry; + } + + /* we flushed them all; remove this inode from the queue */ + spin_lock(&mdsc->snap_flush_lock); + list_del_init(&ci->i_snap_flush_item); + spin_unlock(&mdsc->snap_flush_lock); + + if (psession) + *psession = session; + else if (session) { + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + } +} + +static void ceph_flush_snaps(struct ceph_inode_info *ci) +{ + struct inode *inode = &ci->vfs_inode; + + spin_lock(&inode->i_lock); + __ceph_flush_snaps(ci, NULL); + spin_unlock(&inode->i_lock); +} + +/* + * Mark caps dirty. If inode is newly dirty, add to the global dirty + * list. + */ +void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) +{ + struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; + struct inode *inode = &ci->vfs_inode; + int was = ci->i_dirty_caps; + int dirty = 0; + + dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode, + ceph_cap_string(mask), ceph_cap_string(was), + ceph_cap_string(was | mask)); + ci->i_dirty_caps |= mask; + if (was == 0) { + dout(" inode %p now dirty\n", &ci->vfs_inode); + BUG_ON(!list_empty(&ci->i_dirty_item)); + spin_lock(&mdsc->cap_dirty_lock); + list_add(&ci->i_dirty_item, &mdsc->cap_dirty); + spin_unlock(&mdsc->cap_dirty_lock); + if (ci->i_flushing_caps == 0) { + igrab(inode); + dirty |= I_DIRTY_SYNC; + } + } + BUG_ON(list_empty(&ci->i_dirty_item)); + if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && + (mask & CEPH_CAP_FILE_BUFFER)) + dirty |= I_DIRTY_DATASYNC; + if (dirty) + __mark_inode_dirty(inode, dirty); + __cap_delay_requeue(mdsc, ci); +} + +/* + * Add dirty inode to the flushing list. Assigned a seq number so we + * can wait for caps to flush without starving. + * + * Called under i_lock. + */ +static int __mark_caps_flushing(struct inode *inode, + struct ceph_mds_session *session) +{ + struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + int flushing; + + BUG_ON(ci->i_dirty_caps == 0); + BUG_ON(list_empty(&ci->i_dirty_item)); + + flushing = ci->i_dirty_caps; + dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n", + ceph_cap_string(flushing), + ceph_cap_string(ci->i_flushing_caps), + ceph_cap_string(ci->i_flushing_caps | flushing)); + ci->i_flushing_caps |= flushing; + ci->i_dirty_caps = 0; + dout(" inode %p now !dirty\n", inode); + + spin_lock(&mdsc->cap_dirty_lock); + list_del_init(&ci->i_dirty_item); + + ci->i_cap_flush_seq = ++mdsc->cap_flush_seq; + if (list_empty(&ci->i_flushing_item)) { + list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); + mdsc->num_cap_flushing++; + dout(" inode %p now flushing seq %lld\n", inode, + ci->i_cap_flush_seq); + } else { + list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); + dout(" inode %p now flushing (more) seq %lld\n", inode, + ci->i_cap_flush_seq); + } + spin_unlock(&mdsc->cap_dirty_lock); + + return flushing; +} + +/* + * try to invalidate mapping pages without blocking. + */ +static int mapping_is_empty(struct address_space *mapping) +{ + struct page *page = find_get_page(mapping, 0); + + if (!page) + return 1; + + put_page(page); + return 0; +} + +static int try_nonblocking_invalidate(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + u32 invalidating_gen = ci->i_rdcache_gen; + + spin_unlock(&inode->i_lock); + invalidate_mapping_pages(&inode->i_data, 0, -1); + spin_lock(&inode->i_lock); + + if (mapping_is_empty(&inode->i_data) && + invalidating_gen == ci->i_rdcache_gen) { + /* success. */ + dout("try_nonblocking_invalidate %p success\n", inode); + ci->i_rdcache_gen = 0; + ci->i_rdcache_revoking = 0; + return 0; + } + dout("try_nonblocking_invalidate %p failed\n", inode); + return -1; +} + +/* + * Swiss army knife function to examine currently used and wanted + * versus held caps. Release, flush, ack revoked caps to mds as + * appropriate. + * + * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay + * cap release further. + * CHECK_CAPS_AUTHONLY - we should only check the auth cap + * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without + * further delay. + */ +void ceph_check_caps(struct ceph_inode_info *ci, int flags, + struct ceph_mds_session *session) +{ + struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode); + struct ceph_mds_client *mdsc = &client->mdsc; + struct inode *inode = &ci->vfs_inode; + struct ceph_cap *cap; + int file_wanted, used; + int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ + int drop_session_lock = session ? 0 : 1; + int issued, implemented, want, retain, revoking, flushing = 0; + int mds = -1; /* keep track of how far we've gone through i_caps list + to avoid an infinite loop on retry */ + struct rb_node *p; + int tried_invalidate = 0; + int delayed = 0, sent = 0, force_requeue = 0, num; + int queue_invalidate = 0; + int is_delayed = flags & CHECK_CAPS_NODELAY; + + /* if we are unmounting, flush any unused caps immediately. */ + if (mdsc->stopping) + is_delayed = 1; + + spin_lock(&inode->i_lock); + + if (ci->i_ceph_flags & CEPH_I_FLUSH) + flags |= CHECK_CAPS_FLUSH; + + /* flush snaps first time around only */ + if (!list_empty(&ci->i_cap_snaps)) + __ceph_flush_snaps(ci, &session); + goto retry_locked; +retry: + spin_lock(&inode->i_lock); +retry_locked: + file_wanted = __ceph_caps_file_wanted(ci); + used = __ceph_caps_used(ci); + want = file_wanted | used; + issued = __ceph_caps_issued(ci, &implemented); + revoking = implemented & ~issued; + + retain = want | CEPH_CAP_PIN; + if (!mdsc->stopping && inode->i_nlink > 0) { + if (want) { + retain |= CEPH_CAP_ANY; /* be greedy */ + } else { + retain |= CEPH_CAP_ANY_SHARED; + /* + * keep RD only if we didn't have the file open RW, + * because then the mds would revoke it anyway to + * journal max_size=0. + */ + if (ci->i_max_size == 0) + retain |= CEPH_CAP_ANY_RD; + } + } + + dout("check_caps %p file_want %s used %s dirty %s flushing %s" + " issued %s revoking %s retain %s %s%s%s\n", inode, + ceph_cap_string(file_wanted), + ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), + ceph_cap_string(ci->i_flushing_caps), + ceph_cap_string(issued), ceph_cap_string(revoking), + ceph_cap_string(retain), + (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", + (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "", + (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : ""); + + /* + * If we no longer need to hold onto old our caps, and we may + * have cached pages, but don't want them, then try to invalidate. + * If we fail, it's because pages are locked.... try again later. + */ + if ((!is_delayed || mdsc->stopping) && + ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ + ci->i_rdcache_gen && /* may have cached pages */ + (file_wanted == 0 || /* no open files */ + (revoking & CEPH_CAP_FILE_CACHE)) && /* or revoking cache */ + !tried_invalidate) { + dout("check_caps trying to invalidate on %p\n", inode); + if (try_nonblocking_invalidate(inode) < 0) { + if (revoking & CEPH_CAP_FILE_CACHE) { + dout("check_caps queuing invalidate\n"); + queue_invalidate = 1; + ci->i_rdcache_revoking = ci->i_rdcache_gen; + } else { + dout("check_caps failed to invalidate pages\n"); + /* we failed to invalidate pages. check these + caps again later. */ + force_requeue = 1; + __cap_set_timeouts(mdsc, ci); + } + } + tried_invalidate = 1; + goto retry_locked; + } + + num = 0; + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + num++; + + /* avoid looping forever */ + if (mds >= cap->mds || + ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap)) + continue; + + /* NOTE: no side-effects allowed, until we take s_mutex */ + + revoking = cap->implemented & ~cap->issued; + if (revoking) + dout(" mds%d revoking %s\n", cap->mds, + ceph_cap_string(revoking)); + + if (cap == ci->i_auth_cap && + (cap->issued & CEPH_CAP_FILE_WR)) { + /* request larger max_size from MDS? */ + if (ci->i_wanted_max_size > ci->i_max_size && + ci->i_wanted_max_size > ci->i_requested_max_size) { + dout("requesting new max_size\n"); + goto ack; + } + + /* approaching file_max? */ + if ((inode->i_size << 1) >= ci->i_max_size && + (ci->i_reported_size << 1) < ci->i_max_size) { + dout("i_size approaching max_size\n"); + goto ack; + } + } + /* flush anything dirty? */ + if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) && + ci->i_dirty_caps) { + dout("flushing dirty caps\n"); + goto ack; + } + + /* completed revocation? going down and there are no caps? */ + if (revoking && (revoking & used) == 0) { + dout("completed revocation of %s\n", + ceph_cap_string(cap->implemented & ~cap->issued)); + goto ack; + } + + /* want more caps from mds? */ + if (want & ~(cap->mds_wanted | cap->issued)) + goto ack; + + /* things we might delay */ + if ((cap->issued & ~retain) == 0 && + cap->mds_wanted == want) + continue; /* nope, all good */ + + if (is_delayed) + goto ack; + + /* delay? */ + if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && + time_before(jiffies, ci->i_hold_caps_max)) { + dout(" delaying issued %s -> %s, wanted %s -> %s\n", + ceph_cap_string(cap->issued), + ceph_cap_string(cap->issued & retain), + ceph_cap_string(cap->mds_wanted), + ceph_cap_string(want)); + delayed++; + continue; + } + +ack: + if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { + dout(" skipping %p I_NOFLUSH set\n", inode); + continue; + } + + if (session && session != cap->session) { + dout("oops, wrong session %p mutex\n", session); + mutex_unlock(&session->s_mutex); + session = NULL; + } + if (!session) { + session = cap->session; + if (mutex_trylock(&session->s_mutex) == 0) { + dout("inverting session/ino locks on %p\n", + session); + spin_unlock(&inode->i_lock); + if (took_snap_rwsem) { + up_read(&mdsc->snap_rwsem); + took_snap_rwsem = 0; + } + mutex_lock(&session->s_mutex); + goto retry; + } + } + /* take snap_rwsem after session mutex */ + if (!took_snap_rwsem) { + if (down_read_trylock(&mdsc->snap_rwsem) == 0) { + dout("inverting snap/in locks on %p\n", + inode); + spin_unlock(&inode->i_lock); + down_read(&mdsc->snap_rwsem); + took_snap_rwsem = 1; + goto retry; + } + took_snap_rwsem = 1; + } + + if (cap == ci->i_auth_cap && ci->i_dirty_caps) + flushing = __mark_caps_flushing(inode, session); + + mds = cap->mds; /* remember mds, so we don't repeat */ + sent++; + + /* __send_cap drops i_lock */ + delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, + retain, flushing, NULL); + goto retry; /* retake i_lock and restart our cap scan. */ + } + + /* + * Reschedule delayed caps release if we delayed anything, + * otherwise cancel. + */ + if (delayed && is_delayed) + force_requeue = 1; /* __send_cap delayed release; requeue */ + if (!delayed && !is_delayed) + __cap_delay_cancel(mdsc, ci); + else if (!is_delayed || force_requeue) + __cap_delay_requeue(mdsc, ci); + + spin_unlock(&inode->i_lock); + + if (queue_invalidate) + ceph_queue_invalidate(inode); + + if (session && drop_session_lock) + mutex_unlock(&session->s_mutex); + if (took_snap_rwsem) + up_read(&mdsc->snap_rwsem); +} + +/* + * Try to flush dirty caps back to the auth mds. + */ +static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, + unsigned *flush_tid) +{ + struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + int unlock_session = session ? 0 : 1; + int flushing = 0; + +retry: + spin_lock(&inode->i_lock); + if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { + dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); + goto out; + } + if (ci->i_dirty_caps && ci->i_auth_cap) { + struct ceph_cap *cap = ci->i_auth_cap; + int used = __ceph_caps_used(ci); + int want = __ceph_caps_wanted(ci); + int delayed; + + if (!session) { + spin_unlock(&inode->i_lock); + session = cap->session; + mutex_lock(&session->s_mutex); + goto retry; + } + BUG_ON(session != cap->session); + if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) + goto out; + + flushing = __mark_caps_flushing(inode, session); + + /* __send_cap drops i_lock */ + delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, + cap->issued | cap->implemented, flushing, + flush_tid); + if (!delayed) + goto out_unlocked; + + spin_lock(&inode->i_lock); + __cap_delay_requeue(mdsc, ci); + } +out: + spin_unlock(&inode->i_lock); +out_unlocked: + if (session && unlock_session) + mutex_unlock(&session->s_mutex); + return flushing; +} + +/* + * Return true if we've flushed caps through the given flush_tid. + */ +static int caps_are_flushed(struct inode *inode, unsigned tid) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int dirty, i, ret = 1; + + spin_lock(&inode->i_lock); + dirty = __ceph_caps_dirty(ci); + for (i = 0; i < CEPH_CAP_BITS; i++) + if ((ci->i_flushing_caps & (1 << i)) && + ci->i_cap_flush_tid[i] <= tid) { + /* still flushing this bit */ + ret = 0; + break; + } + spin_unlock(&inode->i_lock); + return ret; +} + +/* + * Wait on any unsafe replies for the given inode. First wait on the + * newest request, and make that the upper bound. Then, if there are + * more requests, keep waiting on the oldest as long as it is still older + * than the original request. + */ +static void sync_write_wait(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct list_head *head = &ci->i_unsafe_writes; + struct ceph_osd_request *req; + u64 last_tid; + + spin_lock(&ci->i_unsafe_lock); + if (list_empty(head)) + goto out; + + /* set upper bound as _last_ entry in chain */ + req = list_entry(head->prev, struct ceph_osd_request, + r_unsafe_item); + last_tid = req->r_tid; + + do { + ceph_osdc_get_request(req); + spin_unlock(&ci->i_unsafe_lock); + dout("sync_write_wait on tid %llu (until %llu)\n", + req->r_tid, last_tid); + wait_for_completion(&req->r_safe_completion); + spin_lock(&ci->i_unsafe_lock); + ceph_osdc_put_request(req); + + /* + * from here on look at first entry in chain, since we + * only want to wait for anything older than last_tid + */ + if (list_empty(head)) + break; + req = list_entry(head->next, struct ceph_osd_request, + r_unsafe_item); + } while (req->r_tid < last_tid); +out: + spin_unlock(&ci->i_unsafe_lock); +} + +int ceph_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + unsigned flush_tid; + int ret; + int dirty; + + dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); + sync_write_wait(inode); + + ret = filemap_write_and_wait(inode->i_mapping); + if (ret < 0) + return ret; + + dirty = try_flush_caps(inode, NULL, &flush_tid); + dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); + + /* + * only wait on non-file metadata writeback (the mds + * can recover size and mtime, so we don't need to + * wait for that) + */ + if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { + dout("fsync waiting for flush_tid %u\n", flush_tid); + ret = wait_event_interruptible(ci->i_cap_wq, + caps_are_flushed(inode, flush_tid)); + } + + dout("fsync %p%s done\n", inode, datasync ? " datasync" : ""); + return ret; +} + +/* + * Flush any dirty caps back to the mds. If we aren't asked to wait, + * queue inode for flush but don't do so immediately, because we can + * get by with fewer MDS messages if we wait for data writeback to + * complete first. + */ +int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + unsigned flush_tid; + int err = 0; + int dirty; + int wait = wbc->sync_mode == WB_SYNC_ALL; + + dout("write_inode %p wait=%d\n", inode, wait); + if (wait) { + dirty = try_flush_caps(inode, NULL, &flush_tid); + if (dirty) + err = wait_event_interruptible(ci->i_cap_wq, + caps_are_flushed(inode, flush_tid)); + } else { + struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + + spin_lock(&inode->i_lock); + if (__ceph_caps_dirty(ci)) + __cap_delay_requeue_front(mdsc, ci); + spin_unlock(&inode->i_lock); + } + return err; +} + +/* + * After a recovering MDS goes active, we need to resend any caps + * we were flushing. + * + * Caller holds session->s_mutex. + */ +static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_cap_snap *capsnap; + + dout("kick_flushing_capsnaps mds%d\n", session->s_mds); + list_for_each_entry(capsnap, &session->s_cap_snaps_flushing, + flushing_item) { + struct ceph_inode_info *ci = capsnap->ci; + struct inode *inode = &ci->vfs_inode; + struct ceph_cap *cap; + + spin_lock(&inode->i_lock); + cap = ci->i_auth_cap; + if (cap && cap->session == session) { + dout("kick_flushing_caps %p cap %p capsnap %p\n", inode, + cap, capsnap); + __ceph_flush_snaps(ci, &session); + } else { + pr_err("%p auth cap %p not mds%d ???\n", inode, + cap, session->s_mds); + spin_unlock(&inode->i_lock); + } + } +} + +void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_inode_info *ci; + + kick_flushing_capsnaps(mdsc, session); + + dout("kick_flushing_caps mds%d\n", session->s_mds); + list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { + struct inode *inode = &ci->vfs_inode; + struct ceph_cap *cap; + int delayed = 0; + + spin_lock(&inode->i_lock); + cap = ci->i_auth_cap; + if (cap && cap->session == session) { + dout("kick_flushing_caps %p cap %p %s\n", inode, + cap, ceph_cap_string(ci->i_flushing_caps)); + delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, + __ceph_caps_used(ci), + __ceph_caps_wanted(ci), + cap->issued | cap->implemented, + ci->i_flushing_caps, NULL); + if (delayed) { + spin_lock(&inode->i_lock); + __cap_delay_requeue(mdsc, ci); + spin_unlock(&inode->i_lock); + } + } else { + pr_err("%p auth cap %p not mds%d ???\n", inode, + cap, session->s_mds); + spin_unlock(&inode->i_lock); + } + } +} + + +/* + * Take references to capabilities we hold, so that we don't release + * them to the MDS prematurely. + * + * Protected by i_lock. + */ +static void __take_cap_refs(struct ceph_inode_info *ci, int got) +{ + if (got & CEPH_CAP_PIN) + ci->i_pin_ref++; + if (got & CEPH_CAP_FILE_RD) + ci->i_rd_ref++; + if (got & CEPH_CAP_FILE_CACHE) + ci->i_rdcache_ref++; + if (got & CEPH_CAP_FILE_WR) + ci->i_wr_ref++; + if (got & CEPH_CAP_FILE_BUFFER) { + if (ci->i_wrbuffer_ref == 0) + igrab(&ci->vfs_inode); + ci->i_wrbuffer_ref++; + dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n", + &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref); + } +} + +/* + * Try to grab cap references. Specify those refs we @want, and the + * minimal set we @need. Also include the larger offset we are writing + * to (when applicable), and check against max_size here as well. + * Note that caller is responsible for ensuring max_size increases are + * requested from the MDS. + */ +static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, + int *got, loff_t endoff, int *check_max, int *err) +{ + struct inode *inode = &ci->vfs_inode; + int ret = 0; + int have, implemented; + int file_wanted; + + dout("get_cap_refs %p need %s want %s\n", inode, + ceph_cap_string(need), ceph_cap_string(want)); + spin_lock(&inode->i_lock); + + /* make sure file is actually open */ + file_wanted = __ceph_caps_file_wanted(ci); + if ((file_wanted & need) == 0) { + dout("try_get_cap_refs need %s file_wanted %s, EBADF\n", + ceph_cap_string(need), ceph_cap_string(file_wanted)); + *err = -EBADF; + ret = 1; + goto out; + } + + if (need & CEPH_CAP_FILE_WR) { + if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { + dout("get_cap_refs %p endoff %llu > maxsize %llu\n", + inode, endoff, ci->i_max_size); + if (endoff > ci->i_wanted_max_size) { + *check_max = 1; + ret = 1; + } + goto out; + } + /* + * If a sync write is in progress, we must wait, so that we + * can get a final snapshot value for size+mtime. + */ + if (__ceph_have_pending_cap_snap(ci)) { + dout("get_cap_refs %p cap_snap_pending\n", inode); + goto out; + } + } + have = __ceph_caps_issued(ci, &implemented); + + /* + * disallow writes while a truncate is pending + */ + if (ci->i_truncate_pending) + have &= ~CEPH_CAP_FILE_WR; + + if ((have & need) == need) { + /* + * Look at (implemented & ~have & not) so that we keep waiting + * on transition from wanted -> needed caps. This is needed + * for WRBUFFER|WR -> WR to avoid a new WR sync write from + * going before a prior buffered writeback happens. + */ + int not = want & ~(have & need); + int revoking = implemented & ~have; + dout("get_cap_refs %p have %s but not %s (revoking %s)\n", + inode, ceph_cap_string(have), ceph_cap_string(not), + ceph_cap_string(revoking)); + if ((revoking & not) == 0) { + *got = need | (have & want); + __take_cap_refs(ci, *got); + ret = 1; + } + } else { + dout("get_cap_refs %p have %s needed %s\n", inode, + ceph_cap_string(have), ceph_cap_string(need)); + } +out: + spin_unlock(&inode->i_lock); + dout("get_cap_refs %p ret %d got %s\n", inode, + ret, ceph_cap_string(*got)); + return ret; +} + +/* + * Check the offset we are writing up to against our current + * max_size. If necessary, tell the MDS we want to write to + * a larger offset. + */ +static void check_max_size(struct inode *inode, loff_t endoff) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int check = 0; + + /* do we need to explicitly request a larger max_size? */ + spin_lock(&inode->i_lock); + if ((endoff >= ci->i_max_size || + endoff > (inode->i_size << 1)) && + endoff > ci->i_wanted_max_size) { + dout("write %p at large endoff %llu, req max_size\n", + inode, endoff); + ci->i_wanted_max_size = endoff; + check = 1; + } + spin_unlock(&inode->i_lock); + if (check) + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); +} + +/* + * Wait for caps, and take cap references. If we can't get a WR cap + * due to a small max_size, make sure we check_max_size (and possibly + * ask the mds) so we don't get hung up indefinitely. + */ +int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got, + loff_t endoff) +{ + int check_max, ret, err; + +retry: + if (endoff > 0) + check_max_size(&ci->vfs_inode, endoff); + check_max = 0; + err = 0; + ret = wait_event_interruptible(ci->i_cap_wq, + try_get_cap_refs(ci, need, want, + got, endoff, + &check_max, &err)); + if (err) + ret = err; + if (check_max) + goto retry; + return ret; +} + +/* + * Take cap refs. Caller must already know we hold at least one ref + * on the caps in question or we don't know this is safe. + */ +void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) +{ + spin_lock(&ci->vfs_inode.i_lock); + __take_cap_refs(ci, caps); + spin_unlock(&ci->vfs_inode.i_lock); +} + +/* + * Release cap refs. + * + * If we released the last ref on any given cap, call ceph_check_caps + * to release (or schedule a release). + * + * If we are releasing a WR cap (from a sync write), finalize any affected + * cap_snap, and wake up any waiters. + */ +void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) +{ + struct inode *inode = &ci->vfs_inode; + int last = 0, put = 0, flushsnaps = 0, wake = 0; + struct ceph_cap_snap *capsnap; + + spin_lock(&inode->i_lock); + if (had & CEPH_CAP_PIN) + --ci->i_pin_ref; + if (had & CEPH_CAP_FILE_RD) + if (--ci->i_rd_ref == 0) + last++; + if (had & CEPH_CAP_FILE_CACHE) + if (--ci->i_rdcache_ref == 0) + last++; + if (had & CEPH_CAP_FILE_BUFFER) { + if (--ci->i_wrbuffer_ref == 0) { + last++; + put++; + } + dout("put_cap_refs %p wrbuffer %d -> %d (?)\n", + inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref); + } + if (had & CEPH_CAP_FILE_WR) + if (--ci->i_wr_ref == 0) { + last++; + if (!list_empty(&ci->i_cap_snaps)) { + capsnap = list_first_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + if (capsnap->writing) { + capsnap->writing = 0; + flushsnaps = + __ceph_finish_cap_snap(ci, + capsnap); + wake = 1; + } + } + } + spin_unlock(&inode->i_lock); + + dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had), + last ? "last" : ""); + + if (last && !flushsnaps) + ceph_check_caps(ci, 0, NULL); + else if (flushsnaps) + ceph_flush_snaps(ci); + if (wake) + wake_up(&ci->i_cap_wq); + if (put) + iput(inode); +} + +/* + * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap + * context. Adjust per-snap dirty page accounting as appropriate. + * Once all dirty data for a cap_snap is flushed, flush snapped file + * metadata back to the MDS. If we dropped the last ref, call + * ceph_check_caps. + */ +void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, + struct ceph_snap_context *snapc) +{ + struct inode *inode = &ci->vfs_inode; + int last = 0; + int last_snap = 0; + int found = 0; + struct ceph_cap_snap *capsnap = NULL; + + spin_lock(&inode->i_lock); + ci->i_wrbuffer_ref -= nr; + last = !ci->i_wrbuffer_ref; + + if (ci->i_head_snapc == snapc) { + ci->i_wrbuffer_ref_head -= nr; + if (!ci->i_wrbuffer_ref_head) { + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } + dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n", + inode, + ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr, + ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, + last ? " LAST" : ""); + } else { + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + if (capsnap->context == snapc) { + found = 1; + capsnap->dirty_pages -= nr; + last_snap = !capsnap->dirty_pages; + break; + } + } + BUG_ON(!found); + dout("put_wrbuffer_cap_refs on %p cap_snap %p " + " snap %lld %d/%d -> %d/%d %s%s\n", + inode, capsnap, capsnap->context->seq, + ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, + ci->i_wrbuffer_ref, capsnap->dirty_pages, + last ? " (wrbuffer last)" : "", + last_snap ? " (capsnap last)" : ""); + } + + spin_unlock(&inode->i_lock); + + if (last) { + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + iput(inode); + } else if (last_snap) { + ceph_flush_snaps(ci); + wake_up(&ci->i_cap_wq); + } +} + +/* + * Handle a cap GRANT message from the MDS. (Note that a GRANT may + * actually be a revocation if it specifies a smaller cap set.) + * + * caller holds s_mutex. + * return value: + * 0 - ok + * 1 - check_caps on auth cap only (writeback) + * 2 - check_caps (ack revoke) + */ +static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, + struct ceph_mds_session *session, + struct ceph_cap *cap, + struct ceph_buffer *xattr_buf) + __releases(inode->i_lock) + +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int mds = session->s_mds; + int seq = le32_to_cpu(grant->seq); + int newcaps = le32_to_cpu(grant->caps); + int issued, implemented, used, wanted, dirty; + u64 size = le64_to_cpu(grant->size); + u64 max_size = le64_to_cpu(grant->max_size); + struct timespec mtime, atime, ctime; + int reply = 0; + int wake = 0; + int writeback = 0; + int revoked_rdcache = 0; + int queue_invalidate = 0; + + dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", + inode, cap, mds, seq, ceph_cap_string(newcaps)); + dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, + inode->i_size); + + /* + * If CACHE is being revoked, and we have no dirty buffers, + * try to invalidate (once). (If there are dirty buffers, we + * will invalidate _after_ writeback.) + */ + if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && + !ci->i_wrbuffer_ref) { + if (try_nonblocking_invalidate(inode) == 0) { + revoked_rdcache = 1; + } else { + /* there were locked pages.. invalidate later + in a separate thread. */ + if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { + queue_invalidate = 1; + ci->i_rdcache_revoking = ci->i_rdcache_gen; + } + } + } + + /* side effects now are allowed */ + + issued = __ceph_caps_issued(ci, &implemented); + issued |= implemented | __ceph_caps_dirty(ci); + + cap->cap_gen = session->s_cap_gen; + + __check_cap_issue(ci, cap, newcaps); + + if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { + inode->i_mode = le32_to_cpu(grant->mode); + inode->i_uid = le32_to_cpu(grant->uid); + inode->i_gid = le32_to_cpu(grant->gid); + dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, + inode->i_uid, inode->i_gid); + } + + if ((issued & CEPH_CAP_LINK_EXCL) == 0) + inode->i_nlink = le32_to_cpu(grant->nlink); + + if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { + int len = le32_to_cpu(grant->xattr_len); + u64 version = le64_to_cpu(grant->xattr_version); + + if (version > ci->i_xattrs.version) { + dout(" got new xattrs v%llu on %p len %d\n", + version, inode, len); + if (ci->i_xattrs.blob) + ceph_buffer_put(ci->i_xattrs.blob); + ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); + ci->i_xattrs.version = version; + } + } + + /* size/ctime/mtime/atime? */ + ceph_fill_file_size(inode, issued, + le32_to_cpu(grant->truncate_seq), + le64_to_cpu(grant->truncate_size), size); + ceph_decode_timespec(&mtime, &grant->mtime); + ceph_decode_timespec(&atime, &grant->atime); + ceph_decode_timespec(&ctime, &grant->ctime); + ceph_fill_file_time(inode, issued, + le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, + &atime); + + /* max size increase? */ + if (max_size != ci->i_max_size) { + dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); + ci->i_max_size = max_size; + if (max_size >= ci->i_wanted_max_size) { + ci->i_wanted_max_size = 0; /* reset */ + ci->i_requested_max_size = 0; + } + wake = 1; + } + + /* check cap bits */ + wanted = __ceph_caps_wanted(ci); + used = __ceph_caps_used(ci); + dirty = __ceph_caps_dirty(ci); + dout(" my wanted = %s, used = %s, dirty %s\n", + ceph_cap_string(wanted), + ceph_cap_string(used), + ceph_cap_string(dirty)); + if (wanted != le32_to_cpu(grant->wanted)) { + dout("mds wanted %s -> %s\n", + ceph_cap_string(le32_to_cpu(grant->wanted)), + ceph_cap_string(wanted)); + grant->wanted = cpu_to_le32(wanted); + } + + cap->seq = seq; + + /* file layout may have changed */ + ci->i_layout = grant->layout; + + /* revocation, grant, or no-op? */ + if (cap->issued & ~newcaps) { + dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued), + ceph_cap_string(newcaps)); + if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER) + writeback = 1; /* will delay ack */ + else if (dirty & ~newcaps) + reply = 1; /* initiate writeback in check_caps */ + else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 || + revoked_rdcache) + reply = 2; /* send revoke ack in check_caps */ + cap->issued = newcaps; + } else if (cap->issued == newcaps) { + dout("caps unchanged: %s -> %s\n", + ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); + } else { + dout("grant: %s -> %s\n", ceph_cap_string(cap->issued), + ceph_cap_string(newcaps)); + cap->issued = newcaps; + cap->implemented |= newcaps; /* add bits only, to + * avoid stepping on a + * pending revocation */ + wake = 1; + } + + spin_unlock(&inode->i_lock); + if (writeback) + /* + * queue inode for writeback: we can't actually call + * filemap_write_and_wait, etc. from message handler + * context. + */ + ceph_queue_writeback(inode); + if (queue_invalidate) + ceph_queue_invalidate(inode); + if (wake) + wake_up(&ci->i_cap_wq); + return reply; +} + +/* + * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the + * MDS has been safely committed. + */ +static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, + struct ceph_mds_caps *m, + struct ceph_mds_session *session, + struct ceph_cap *cap) + __releases(inode->i_lock) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + unsigned seq = le32_to_cpu(m->seq); + int dirty = le32_to_cpu(m->dirty); + int cleaned = 0; + int drop = 0; + int i; + + for (i = 0; i < CEPH_CAP_BITS; i++) + if ((dirty & (1 << i)) && + flush_tid == ci->i_cap_flush_tid[i]) + cleaned |= 1 << i; + + dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s," + " flushing %s -> %s\n", + inode, session->s_mds, seq, ceph_cap_string(dirty), + ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), + ceph_cap_string(ci->i_flushing_caps & ~cleaned)); + + if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned)) + goto out; + + ci->i_flushing_caps &= ~cleaned; + + spin_lock(&mdsc->cap_dirty_lock); + if (ci->i_flushing_caps == 0) { + list_del_init(&ci->i_flushing_item); + if (!list_empty(&session->s_cap_flushing)) + dout(" mds%d still flushing cap on %p\n", + session->s_mds, + &list_entry(session->s_cap_flushing.next, + struct ceph_inode_info, + i_flushing_item)->vfs_inode); + mdsc->num_cap_flushing--; + wake_up(&mdsc->cap_flushing_wq); + dout(" inode %p now !flushing\n", inode); + + if (ci->i_dirty_caps == 0) { + dout(" inode %p now clean\n", inode); + BUG_ON(!list_empty(&ci->i_dirty_item)); + drop = 1; + } else { + BUG_ON(list_empty(&ci->i_dirty_item)); + } + } + spin_unlock(&mdsc->cap_dirty_lock); + wake_up(&ci->i_cap_wq); + +out: + spin_unlock(&inode->i_lock); + if (drop) + iput(inode); +} + +/* + * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can + * throw away our cap_snap. + * + * Caller hold s_mutex. + */ +static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, + struct ceph_mds_caps *m, + struct ceph_mds_session *session) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + u64 follows = le64_to_cpu(m->snap_follows); + struct ceph_cap_snap *capsnap; + int drop = 0; + + dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n", + inode, ci, session->s_mds, follows); + + spin_lock(&inode->i_lock); + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + if (capsnap->follows == follows) { + if (capsnap->flush_tid != flush_tid) { + dout(" cap_snap %p follows %lld tid %lld !=" + " %lld\n", capsnap, follows, + flush_tid, capsnap->flush_tid); + break; + } + WARN_ON(capsnap->dirty_pages || capsnap->writing); + dout(" removing cap_snap %p follows %lld\n", + capsnap, follows); + ceph_put_snap_context(capsnap->context); + list_del(&capsnap->ci_item); + list_del(&capsnap->flushing_item); + ceph_put_cap_snap(capsnap); + drop = 1; + break; + } else { + dout(" skipping cap_snap %p follows %lld\n", + capsnap, capsnap->follows); + } + } + spin_unlock(&inode->i_lock); + if (drop) + iput(inode); +} + +/* + * Handle TRUNC from MDS, indicating file truncation. + * + * caller hold s_mutex. + */ +static void handle_cap_trunc(struct inode *inode, + struct ceph_mds_caps *trunc, + struct ceph_mds_session *session) + __releases(inode->i_lock) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int mds = session->s_mds; + int seq = le32_to_cpu(trunc->seq); + u32 truncate_seq = le32_to_cpu(trunc->truncate_seq); + u64 truncate_size = le64_to_cpu(trunc->truncate_size); + u64 size = le64_to_cpu(trunc->size); + int implemented = 0; + int dirty = __ceph_caps_dirty(ci); + int issued = __ceph_caps_issued(ceph_inode(inode), &implemented); + int queue_trunc = 0; + + issued |= implemented | dirty; + + dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n", + inode, mds, seq, truncate_size, truncate_seq); + queue_trunc = ceph_fill_file_size(inode, issued, + truncate_seq, truncate_size, size); + spin_unlock(&inode->i_lock); + + if (queue_trunc) + ceph_queue_vmtruncate(inode); +} + +/* + * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a + * different one. If we are the most recent migration we've seen (as + * indicated by mseq), make note of the migrating cap bits for the + * duration (until we see the corresponding IMPORT). + * + * caller holds s_mutex + */ +static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, + struct ceph_mds_session *session) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int mds = session->s_mds; + unsigned mseq = le32_to_cpu(ex->migrate_seq); + struct ceph_cap *cap = NULL, *t; + struct rb_node *p; + int remember = 1; + + dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", + inode, ci, mds, mseq); + + spin_lock(&inode->i_lock); + + /* make sure we haven't seen a higher mseq */ + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + t = rb_entry(p, struct ceph_cap, ci_node); + if (ceph_seq_cmp(t->mseq, mseq) > 0) { + dout(" higher mseq on cap from mds%d\n", + t->session->s_mds); + remember = 0; + } + if (t->session->s_mds == mds) + cap = t; + } + + if (cap) { + if (remember) { + /* make note */ + ci->i_cap_exporting_mds = mds; + ci->i_cap_exporting_mseq = mseq; + ci->i_cap_exporting_issued = cap->issued; + } + __ceph_remove_cap(cap); + } else { + WARN_ON(!cap); + } + + spin_unlock(&inode->i_lock); +} + +/* + * Handle cap IMPORT. If there are temp bits from an older EXPORT, + * clean them up. + * + * caller holds s_mutex. + */ +static void handle_cap_import(struct ceph_mds_client *mdsc, + struct inode *inode, struct ceph_mds_caps *im, + struct ceph_mds_session *session, + void *snaptrace, int snaptrace_len) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int mds = session->s_mds; + unsigned issued = le32_to_cpu(im->caps); + unsigned wanted = le32_to_cpu(im->wanted); + unsigned seq = le32_to_cpu(im->seq); + unsigned mseq = le32_to_cpu(im->migrate_seq); + u64 realmino = le64_to_cpu(im->realm); + u64 cap_id = le64_to_cpu(im->cap_id); + + if (ci->i_cap_exporting_mds >= 0 && + ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) { + dout("handle_cap_import inode %p ci %p mds%d mseq %d" + " - cleared exporting from mds%d\n", + inode, ci, mds, mseq, + ci->i_cap_exporting_mds); + ci->i_cap_exporting_issued = 0; + ci->i_cap_exporting_mseq = 0; + ci->i_cap_exporting_mds = -1; + } else { + dout("handle_cap_import inode %p ci %p mds%d mseq %d\n", + inode, ci, mds, mseq); + } + + down_write(&mdsc->snap_rwsem); + ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len, + false); + downgrade_write(&mdsc->snap_rwsem); + ceph_add_cap(inode, session, cap_id, -1, + issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, + NULL /* no caps context */); + try_flush_caps(inode, session, NULL); + up_read(&mdsc->snap_rwsem); +} + +/* + * Handle a caps message from the MDS. + * + * Identify the appropriate session, inode, and call the right handler + * based on the cap op. + */ +void ceph_handle_caps(struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct ceph_mds_client *mdsc = session->s_mdsc; + struct super_block *sb = mdsc->client->sb; + struct inode *inode; + struct ceph_cap *cap; + struct ceph_mds_caps *h; + int mds = session->s_mds; + int op; + u32 seq; + struct ceph_vino vino; + u64 cap_id; + u64 size, max_size; + u64 tid; + int check_caps = 0; + void *snaptrace; + int r; + + dout("handle_caps from mds%d\n", mds); + + /* decode */ + tid = le64_to_cpu(msg->hdr.tid); + if (msg->front.iov_len < sizeof(*h)) + goto bad; + h = msg->front.iov_base; + snaptrace = h + 1; + op = le32_to_cpu(h->op); + vino.ino = le64_to_cpu(h->ino); + vino.snap = CEPH_NOSNAP; + cap_id = le64_to_cpu(h->cap_id); + seq = le32_to_cpu(h->seq); + size = le64_to_cpu(h->size); + max_size = le64_to_cpu(h->max_size); + + mutex_lock(&session->s_mutex); + session->s_seq++; + dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, + (unsigned)seq); + + /* lookup ino */ + inode = ceph_find_inode(sb, vino); + dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, + vino.snap, inode); + if (!inode) { + dout(" i don't have ino %llx\n", vino.ino); + goto done; + } + + /* these will work even if we don't have a cap yet */ + switch (op) { + case CEPH_CAP_OP_FLUSHSNAP_ACK: + handle_cap_flushsnap_ack(inode, tid, h, session); + goto done; + + case CEPH_CAP_OP_EXPORT: + handle_cap_export(inode, h, session); + goto done; + + case CEPH_CAP_OP_IMPORT: + handle_cap_import(mdsc, inode, h, session, + snaptrace, le32_to_cpu(h->snap_trace_len)); + check_caps = 1; /* we may have sent a RELEASE to the old auth */ + goto done; + } + + /* the rest require a cap */ + spin_lock(&inode->i_lock); + cap = __get_cap_for_mds(ceph_inode(inode), mds); + if (!cap) { + dout("no cap on %p ino %llx.%llx from mds%d, releasing\n", + inode, ceph_ino(inode), ceph_snap(inode), mds); + spin_unlock(&inode->i_lock); + goto done; + } + + /* note that each of these drops i_lock for us */ + switch (op) { + case CEPH_CAP_OP_REVOKE: + case CEPH_CAP_OP_GRANT: + r = handle_cap_grant(inode, h, session, cap, msg->middle); + if (r == 1) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY, + session); + else if (r == 2) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_NODELAY, + session); + break; + + case CEPH_CAP_OP_FLUSH_ACK: + handle_cap_flush_ack(inode, tid, h, session, cap); + break; + + case CEPH_CAP_OP_TRUNC: + handle_cap_trunc(inode, h, session); + break; + + default: + spin_unlock(&inode->i_lock); + pr_err("ceph_handle_caps: unknown cap op %d %s\n", op, + ceph_cap_op_name(op)); + } + +done: + mutex_unlock(&session->s_mutex); + + if (check_caps) + ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, NULL); + if (inode) + iput(inode); + return; + +bad: + pr_err("ceph_handle_caps: corrupt message\n"); + ceph_msg_dump(msg); + return; +} + +/* + * Delayed work handler to process end of delayed cap release LRU list. + */ +void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) +{ + struct ceph_inode_info *ci; + int flags = CHECK_CAPS_NODELAY; + + dout("check_delayed_caps\n"); + while (1) { + spin_lock(&mdsc->cap_delay_lock); + if (list_empty(&mdsc->cap_delay_list)) + break; + ci = list_first_entry(&mdsc->cap_delay_list, + struct ceph_inode_info, + i_cap_delay_list); + if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 && + time_before(jiffies, ci->i_hold_caps_max)) + break; + list_del_init(&ci->i_cap_delay_list); + spin_unlock(&mdsc->cap_delay_lock); + dout("check_delayed_caps on %p\n", &ci->vfs_inode); + ceph_check_caps(ci, flags, NULL); + } + spin_unlock(&mdsc->cap_delay_lock); +} + +/* + * Flush all dirty caps to the mds + */ +void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) +{ + struct ceph_inode_info *ci, *nci = NULL; + struct inode *inode, *ninode = NULL; + struct list_head *p, *n; + + dout("flush_dirty_caps\n"); + spin_lock(&mdsc->cap_dirty_lock); + list_for_each_safe(p, n, &mdsc->cap_dirty) { + if (nci) { + ci = nci; + inode = ninode; + ci->i_ceph_flags &= ~CEPH_I_NOFLUSH; + dout("flush_dirty_caps inode %p (was next inode)\n", + inode); + } else { + ci = list_entry(p, struct ceph_inode_info, + i_dirty_item); + inode = igrab(&ci->vfs_inode); + BUG_ON(!inode); + dout("flush_dirty_caps inode %p\n", inode); + } + if (n != &mdsc->cap_dirty) { + nci = list_entry(n, struct ceph_inode_info, + i_dirty_item); + ninode = igrab(&nci->vfs_inode); + BUG_ON(!ninode); + nci->i_ceph_flags |= CEPH_I_NOFLUSH; + dout("flush_dirty_caps next inode %p, noflush\n", + ninode); + } else { + nci = NULL; + ninode = NULL; + } + spin_unlock(&mdsc->cap_dirty_lock); + if (inode) { + ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, + NULL); + iput(inode); + } + spin_lock(&mdsc->cap_dirty_lock); + } + spin_unlock(&mdsc->cap_dirty_lock); +} + +/* + * Drop open file reference. If we were the last open file, + * we may need to release capabilities to the MDS (or schedule + * their delayed release). + */ +void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) +{ + struct inode *inode = &ci->vfs_inode; + int last = 0; + + spin_lock(&inode->i_lock); + dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode, + ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1); + BUG_ON(ci->i_nr_by_mode[fmode] == 0); + if (--ci->i_nr_by_mode[fmode] == 0) + last++; + spin_unlock(&inode->i_lock); + + if (last && ci->i_vino.snap == CEPH_NOSNAP) + ceph_check_caps(ci, 0, NULL); +} + +/* + * Helpers for embedding cap and dentry lease releases into mds + * requests. + * + * @force is used by dentry_release (below) to force inclusion of a + * record for the directory inode, even when there aren't any caps to + * drop. + */ +int ceph_encode_inode_release(void **p, struct inode *inode, + int mds, int drop, int unless, int force) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *cap; + struct ceph_mds_request_release *rel = *p; + int ret = 0; + + dout("encode_inode_release %p mds%d drop %s unless %s\n", inode, + mds, ceph_cap_string(drop), ceph_cap_string(unless)); + + spin_lock(&inode->i_lock); + cap = __get_cap_for_mds(ci, mds); + if (cap && __cap_is_valid(cap)) { + if (force || + ((cap->issued & drop) && + (cap->issued & unless) == 0)) { + if ((cap->issued & drop) && + (cap->issued & unless) == 0) { + dout("encode_inode_release %p cap %p %s -> " + "%s\n", inode, cap, + ceph_cap_string(cap->issued), + ceph_cap_string(cap->issued & ~drop)); + cap->issued &= ~drop; + cap->implemented &= ~drop; + if (ci->i_ceph_flags & CEPH_I_NODELAY) { + int wanted = __ceph_caps_wanted(ci); + dout(" wanted %s -> %s (act %s)\n", + ceph_cap_string(cap->mds_wanted), + ceph_cap_string(cap->mds_wanted & + ~wanted), + ceph_cap_string(wanted)); + cap->mds_wanted &= wanted; + } + } else { + dout("encode_inode_release %p cap %p %s" + " (force)\n", inode, cap, + ceph_cap_string(cap->issued)); + } + + rel->ino = cpu_to_le64(ceph_ino(inode)); + rel->cap_id = cpu_to_le64(cap->cap_id); + rel->seq = cpu_to_le32(cap->seq); + rel->issue_seq = cpu_to_le32(cap->issue_seq), + rel->mseq = cpu_to_le32(cap->mseq); + rel->caps = cpu_to_le32(cap->issued); + rel->wanted = cpu_to_le32(cap->mds_wanted); + rel->dname_len = 0; + rel->dname_seq = 0; + *p += sizeof(*rel); + ret = 1; + } else { + dout("encode_inode_release %p cap %p %s\n", + inode, cap, ceph_cap_string(cap->issued)); + } + } + spin_unlock(&inode->i_lock); + return ret; +} + +int ceph_encode_dentry_release(void **p, struct dentry *dentry, + int mds, int drop, int unless) +{ + struct inode *dir = dentry->d_parent->d_inode; + struct ceph_mds_request_release *rel = *p; + struct ceph_dentry_info *di = ceph_dentry(dentry); + int force = 0; + int ret; + + /* + * force an record for the directory caps if we have a dentry lease. + * this is racy (can't take i_lock and d_lock together), but it + * doesn't have to be perfect; the mds will revoke anything we don't + * release. + */ + spin_lock(&dentry->d_lock); + if (di->lease_session && di->lease_session->s_mds == mds) + force = 1; + spin_unlock(&dentry->d_lock); + + ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force); + + spin_lock(&dentry->d_lock); + if (ret && di->lease_session && di->lease_session->s_mds == mds) { + dout("encode_dentry_release %p mds%d seq %d\n", + dentry, mds, (int)di->lease_seq); + rel->dname_len = cpu_to_le32(dentry->d_name.len); + memcpy(*p, dentry->d_name.name, dentry->d_name.len); + *p += dentry->d_name.len; + rel->dname_seq = cpu_to_le32(di->lease_seq); + } + spin_unlock(&dentry->d_lock); + return ret; +} diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h new file mode 100644 index 00000000000..1818c230561 --- /dev/null +++ b/fs/ceph/ceph_debug.h @@ -0,0 +1,37 @@ +#ifndef _FS_CEPH_DEBUG_H +#define _FS_CEPH_DEBUG_H + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#ifdef CONFIG_CEPH_FS_PRETTYDEBUG + +/* + * wrap pr_debug to include a filename:lineno prefix on each line. + * this incurs some overhead (kernel size and execution time) due to + * the extra function call at each call site. + */ + +# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) +extern const char *ceph_file_part(const char *s, int len); +# define dout(fmt, ...) \ + pr_debug(" %12.12s:%-4d : " fmt, \ + ceph_file_part(__FILE__, sizeof(__FILE__)), \ + __LINE__, ##__VA_ARGS__) +# else +/* faux printk call just to see any compiler warnings. */ +# define dout(fmt, ...) do { \ + if (0) \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ + } while (0) +# endif + +#else + +/* + * or, just wrap pr_debug + */ +# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__) + +#endif + +#endif diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c new file mode 100644 index 00000000000..ab6cf35c409 --- /dev/null +++ b/fs/ceph/ceph_frag.c @@ -0,0 +1,21 @@ +/* + * Ceph 'frag' type + */ +#include "types.h" + +int ceph_frag_compare(__u32 a, __u32 b) +{ + unsigned va = ceph_frag_value(a); + unsigned vb = ceph_frag_value(b); + if (va < vb) + return -1; + if (va > vb) + return 1; + va = ceph_frag_bits(a); + vb = ceph_frag_bits(b); + if (va < vb) + return -1; + if (va > vb) + return 1; + return 0; +} diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h new file mode 100644 index 00000000000..793f50cb7c2 --- /dev/null +++ b/fs/ceph/ceph_frag.h @@ -0,0 +1,109 @@ +#ifndef _FS_CEPH_FRAG_H +#define _FS_CEPH_FRAG_H + +/* + * "Frags" are a way to describe a subset of a 32-bit number space, + * using a mask and a value to match against that mask. Any given frag + * (subset of the number space) can be partitioned into 2^n sub-frags. + * + * Frags are encoded into a 32-bit word: + * 8 upper bits = "bits" + * 24 lower bits = "value" + * (We could go to 5+27 bits, but who cares.) + * + * We use the _most_ significant bits of the 24 bit value. This makes + * values logically sort. + * + * Unfortunately, because the "bits" field is still in the high bits, we + * can't sort encoded frags numerically. However, it does allow you + * to feed encoded frags as values into frag_contains_value. + */ +static inline __u32 ceph_frag_make(__u32 b, __u32 v) +{ + return (b << 24) | + (v & (0xffffffu << (24-b)) & 0xffffffu); +} +static inline __u32 ceph_frag_bits(__u32 f) +{ + return f >> 24; +} +static inline __u32 ceph_frag_value(__u32 f) +{ + return f & 0xffffffu; +} +static inline __u32 ceph_frag_mask(__u32 f) +{ + return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu; +} +static inline __u32 ceph_frag_mask_shift(__u32 f) +{ + return 24 - ceph_frag_bits(f); +} + +static inline int ceph_frag_contains_value(__u32 f, __u32 v) +{ + return (v & ceph_frag_mask(f)) == ceph_frag_value(f); +} +static inline int ceph_frag_contains_frag(__u32 f, __u32 sub) +{ + /* is sub as specific as us, and contained by us? */ + return ceph_frag_bits(sub) >= ceph_frag_bits(f) && + (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f); +} + +static inline __u32 ceph_frag_parent(__u32 f) +{ + return ceph_frag_make(ceph_frag_bits(f) - 1, + ceph_frag_value(f) & (ceph_frag_mask(f) << 1)); +} +static inline int ceph_frag_is_left_child(__u32 f) +{ + return ceph_frag_bits(f) > 0 && + (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0; +} +static inline int ceph_frag_is_right_child(__u32 f) +{ + return ceph_frag_bits(f) > 0 && + (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1; +} +static inline __u32 ceph_frag_sibling(__u32 f) +{ + return ceph_frag_make(ceph_frag_bits(f), + ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f))); +} +static inline __u32 ceph_frag_left_child(__u32 f) +{ + return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f)); +} +static inline __u32 ceph_frag_right_child(__u32 f) +{ + return ceph_frag_make(ceph_frag_bits(f)+1, + ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f)))); +} +static inline __u32 ceph_frag_make_child(__u32 f, int by, int i) +{ + int newbits = ceph_frag_bits(f) + by; + return ceph_frag_make(newbits, + ceph_frag_value(f) | (i << (24 - newbits))); +} +static inline int ceph_frag_is_leftmost(__u32 f) +{ + return ceph_frag_value(f) == 0; +} +static inline int ceph_frag_is_rightmost(__u32 f) +{ + return ceph_frag_value(f) == ceph_frag_mask(f); +} +static inline __u32 ceph_frag_next(__u32 f) +{ + return ceph_frag_make(ceph_frag_bits(f), + ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f))); +} + +/* + * comparator to sort frags logically, as when traversing the + * number space in ascending order... + */ +int ceph_frag_compare(__u32 a, __u32 b); + +#endif diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c new file mode 100644 index 00000000000..79d76bc4303 --- /dev/null +++ b/fs/ceph/ceph_fs.c @@ -0,0 +1,74 @@ +/* + * Some non-inline ceph helpers + */ +#include "types.h" + +/* + * return true if @layout appears to be valid + */ +int ceph_file_layout_is_valid(const struct ceph_file_layout *layout) +{ + __u32 su = le32_to_cpu(layout->fl_stripe_unit); + __u32 sc = le32_to_cpu(layout->fl_stripe_count); + __u32 os = le32_to_cpu(layout->fl_object_size); + + /* stripe unit, object size must be non-zero, 64k increment */ + if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1))) + return 0; + if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1))) + return 0; + /* object size must be a multiple of stripe unit */ + if (os < su || os % su) + return 0; + /* stripe count must be non-zero */ + if (!sc) + return 0; + return 1; +} + + +int ceph_flags_to_mode(int flags) +{ +#ifdef O_DIRECTORY /* fixme */ + if ((flags & O_DIRECTORY) == O_DIRECTORY) + return CEPH_FILE_MODE_PIN; +#endif +#ifdef O_LAZY + if (flags & O_LAZY) + return CEPH_FILE_MODE_LAZY; +#endif + if ((flags & O_APPEND) == O_APPEND) + flags |= O_WRONLY; + + flags &= O_ACCMODE; + if ((flags & O_RDWR) == O_RDWR) + return CEPH_FILE_MODE_RDWR; + if ((flags & O_WRONLY) == O_WRONLY) + return CEPH_FILE_MODE_WR; + return CEPH_FILE_MODE_RD; +} + +int ceph_caps_for_mode(int mode) +{ + switch (mode) { + case CEPH_FILE_MODE_PIN: + return CEPH_CAP_PIN; + case CEPH_FILE_MODE_RD: + return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED | + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE; + case CEPH_FILE_MODE_RDWR: + return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED | + CEPH_CAP_FILE_EXCL | + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE | + CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | + CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL | + CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL; + case CEPH_FILE_MODE_WR: + return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED | + CEPH_CAP_FILE_EXCL | + CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | + CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL | + CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL; + } + return 0; +} diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h new file mode 100644 index 00000000000..0c2241ef365 --- /dev/null +++ b/fs/ceph/ceph_fs.h @@ -0,0 +1,650 @@ +/* + * ceph_fs.h - Ceph constants and data types to share between kernel and + * user space. + * + * Most types in this file are defined as little-endian, and are + * primarily intended to describe data structures that pass over the + * wire or that are stored on disk. + * + * LGPL2 + */ + +#ifndef _FS_CEPH_CEPH_FS_H +#define _FS_CEPH_CEPH_FS_H + +#include "msgr.h" +#include "rados.h" + +/* + * Ceph release version + */ +#define CEPH_VERSION_MAJOR 0 +#define CEPH_VERSION_MINOR 19 +#define CEPH_VERSION_PATCH 0 + +#define _CEPH_STRINGIFY(x) #x +#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x) +#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \ + "." CEPH_STRINGIFY(z) +#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \ + CEPH_VERSION_MINOR, CEPH_VERSION_PATCH) + +/* + * subprotocol versions. when specific messages types or high-level + * protocols change, bump the affected components. we keep rev + * internal cluster protocols separately from the public, + * client-facing protocol. + */ +#define CEPH_OSD_PROTOCOL 8 /* cluster internal */ +#define CEPH_MDS_PROTOCOL 9 /* cluster internal */ +#define CEPH_MON_PROTOCOL 5 /* cluster internal */ +#define CEPH_OSDC_PROTOCOL 24 /* server/client */ +#define CEPH_MDSC_PROTOCOL 32 /* server/client */ +#define CEPH_MONC_PROTOCOL 15 /* server/client */ + + +#define CEPH_INO_ROOT 1 +#define CEPH_INO_CEPH 2 /* hidden .ceph dir */ + +/* arbitrary limit on max # of monitors (cluster of 3 is typical) */ +#define CEPH_MAX_MON 31 + + +/* + * feature bits + */ +#define CEPH_FEATURE_SUPPORTED 0 +#define CEPH_FEATURE_REQUIRED 0 + + +/* + * ceph_file_layout - describe data layout for a file/inode + */ +struct ceph_file_layout { + /* file -> object mapping */ + __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple + of page size. */ + __le32 fl_stripe_count; /* over this many objects */ + __le32 fl_object_size; /* until objects are this big, then move to + new objects */ + __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */ + + /* pg -> disk layout */ + __le32 fl_object_stripe_unit; /* for per-object parity, if any */ + + /* object -> pg layout */ + __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */ + __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */ +} __attribute__ ((packed)); + +#define CEPH_MIN_STRIPE_UNIT 65536 + +int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); + + +/* crypto algorithms */ +#define CEPH_CRYPTO_NONE 0x0 +#define CEPH_CRYPTO_AES 0x1 + +/* security/authentication protocols */ +#define CEPH_AUTH_UNKNOWN 0x0 +#define CEPH_AUTH_NONE 0x1 +#define CEPH_AUTH_CEPHX 0x2 + + +/********************************************* + * message layer + */ + +/* + * message types + */ + +/* misc */ +#define CEPH_MSG_SHUTDOWN 1 +#define CEPH_MSG_PING 2 + +/* client <-> monitor */ +#define CEPH_MSG_MON_MAP 4 +#define CEPH_MSG_MON_GET_MAP 5 +#define CEPH_MSG_STATFS 13 +#define CEPH_MSG_STATFS_REPLY 14 +#define CEPH_MSG_MON_SUBSCRIBE 15 +#define CEPH_MSG_MON_SUBSCRIBE_ACK 16 +#define CEPH_MSG_AUTH 17 +#define CEPH_MSG_AUTH_REPLY 18 + +/* client <-> mds */ +#define CEPH_MSG_MDS_MAP 21 + +#define CEPH_MSG_CLIENT_SESSION 22 +#define CEPH_MSG_CLIENT_RECONNECT 23 + +#define CEPH_MSG_CLIENT_REQUEST 24 +#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25 +#define CEPH_MSG_CLIENT_REPLY 26 +#define CEPH_MSG_CLIENT_CAPS 0x310 +#define CEPH_MSG_CLIENT_LEASE 0x311 +#define CEPH_MSG_CLIENT_SNAP 0x312 +#define CEPH_MSG_CLIENT_CAPRELEASE 0x313 + +/* osd */ +#define CEPH_MSG_OSD_MAP 41 +#define CEPH_MSG_OSD_OP 42 +#define CEPH_MSG_OSD_OPREPLY 43 + +struct ceph_mon_request_header { + __le64 have_version; + __le16 session_mon; + __le64 session_mon_tid; +} __attribute__ ((packed)); + +struct ceph_mon_statfs { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; +} __attribute__ ((packed)); + +struct ceph_statfs { + __le64 kb, kb_used, kb_avail; + __le64 num_objects; +} __attribute__ ((packed)); + +struct ceph_mon_statfs_reply { + struct ceph_fsid fsid; + __le64 version; + struct ceph_statfs st; +} __attribute__ ((packed)); + +struct ceph_osd_getmap { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; + __le32 start; +} __attribute__ ((packed)); + +struct ceph_mds_getmap { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; +} __attribute__ ((packed)); + +struct ceph_client_mount { + struct ceph_mon_request_header monhdr; +} __attribute__ ((packed)); + +struct ceph_mon_subscribe_item { + __le64 have_version; __le64 have; + __u8 onetime; +} __attribute__ ((packed)); + +struct ceph_mon_subscribe_ack { + __le32 duration; /* seconds */ + struct ceph_fsid fsid; +} __attribute__ ((packed)); + +/* + * mds states + * > 0 -> in + * <= 0 -> out + */ +#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */ +#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees. + empty log. */ +#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */ +#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */ +#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */ +#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */ +#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */ + +#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */ +#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed + operations (import, rename, etc.) */ +#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */ +#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */ +#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */ +#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */ +#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */ + +extern const char *ceph_mds_state_name(int s); + + +/* + * metadata lock types. + * - these are bitmasks.. we can compose them + * - they also define the lock ordering by the MDS + * - a few of these are internal to the mds + */ +#define CEPH_LOCK_DN 1 +#define CEPH_LOCK_ISNAP 2 +#define CEPH_LOCK_IVERSION 4 /* mds internal */ +#define CEPH_LOCK_IFILE 8 /* mds internal */ +#define CEPH_LOCK_IAUTH 32 +#define CEPH_LOCK_ILINK 64 +#define CEPH_LOCK_IDFT 128 /* dir frag tree */ +#define CEPH_LOCK_INEST 256 /* mds internal */ +#define CEPH_LOCK_IXATTR 512 +#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */ + +/* client_session ops */ +enum { + CEPH_SESSION_REQUEST_OPEN, + CEPH_SESSION_OPEN, + CEPH_SESSION_REQUEST_CLOSE, + CEPH_SESSION_CLOSE, + CEPH_SESSION_REQUEST_RENEWCAPS, + CEPH_SESSION_RENEWCAPS, + CEPH_SESSION_STALE, + CEPH_SESSION_RECALL_STATE, +}; + +extern const char *ceph_session_op_name(int op); + +struct ceph_mds_session_head { + __le32 op; + __le64 seq; + struct ceph_timespec stamp; + __le32 max_caps, max_leases; +} __attribute__ ((packed)); + +/* client_request */ +/* + * metadata ops. + * & 0x001000 -> write op + * & 0x010000 -> follow symlink (e.g. stat(), not lstat()). + & & 0x100000 -> use weird ino/path trace + */ +#define CEPH_MDS_OP_WRITE 0x001000 +enum { + CEPH_MDS_OP_LOOKUP = 0x00100, + CEPH_MDS_OP_GETATTR = 0x00101, + CEPH_MDS_OP_LOOKUPHASH = 0x00102, + CEPH_MDS_OP_LOOKUPPARENT = 0x00103, + + CEPH_MDS_OP_SETXATTR = 0x01105, + CEPH_MDS_OP_RMXATTR = 0x01106, + CEPH_MDS_OP_SETLAYOUT = 0x01107, + CEPH_MDS_OP_SETATTR = 0x01108, + + CEPH_MDS_OP_MKNOD = 0x01201, + CEPH_MDS_OP_LINK = 0x01202, + CEPH_MDS_OP_UNLINK = 0x01203, + CEPH_MDS_OP_RENAME = 0x01204, + CEPH_MDS_OP_MKDIR = 0x01220, + CEPH_MDS_OP_RMDIR = 0x01221, + CEPH_MDS_OP_SYMLINK = 0x01222, + + CEPH_MDS_OP_CREATE = 0x01301, + CEPH_MDS_OP_OPEN = 0x00302, + CEPH_MDS_OP_READDIR = 0x00305, + + CEPH_MDS_OP_LOOKUPSNAP = 0x00400, + CEPH_MDS_OP_MKSNAP = 0x01400, + CEPH_MDS_OP_RMSNAP = 0x01401, + CEPH_MDS_OP_LSSNAP = 0x00402, +}; + +extern const char *ceph_mds_op_name(int op); + + +#define CEPH_SETATTR_MODE 1 +#define CEPH_SETATTR_UID 2 +#define CEPH_SETATTR_GID 4 +#define CEPH_SETATTR_MTIME 8 +#define CEPH_SETATTR_ATIME 16 +#define CEPH_SETATTR_SIZE 32 +#define CEPH_SETATTR_CTIME 64 + +union ceph_mds_request_args { + struct { + __le32 mask; /* CEPH_CAP_* */ + } __attribute__ ((packed)) getattr; + struct { + __le32 mode; + __le32 uid; + __le32 gid; + struct ceph_timespec mtime; + struct ceph_timespec atime; + __le64 size, old_size; /* old_size needed by truncate */ + __le32 mask; /* CEPH_SETATTR_* */ + } __attribute__ ((packed)) setattr; + struct { + __le32 frag; /* which dir fragment */ + __le32 max_entries; /* how many dentries to grab */ + } __attribute__ ((packed)) readdir; + struct { + __le32 mode; + __le32 rdev; + } __attribute__ ((packed)) mknod; + struct { + __le32 mode; + } __attribute__ ((packed)) mkdir; + struct { + __le32 flags; + __le32 mode; + __le32 stripe_unit; /* layout for newly created file */ + __le32 stripe_count; /* ... */ + __le32 object_size; + __le32 file_replication; + __le32 preferred; + } __attribute__ ((packed)) open; + struct { + __le32 flags; + } __attribute__ ((packed)) setxattr; + struct { + struct ceph_file_layout layout; + } __attribute__ ((packed)) setlayout; +} __attribute__ ((packed)); + +#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ +#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */ + +struct ceph_mds_request_head { + __le64 oldest_client_tid; + __le32 mdsmap_epoch; /* on client */ + __le32 flags; /* CEPH_MDS_FLAG_* */ + __u8 num_retry, num_fwd; /* count retry, fwd attempts */ + __le16 num_releases; /* # include cap/lease release records */ + __le32 op; /* mds op code */ + __le32 caller_uid, caller_gid; + __le64 ino; /* use this ino for openc, mkdir, mknod, + etc. (if replaying) */ + union ceph_mds_request_args args; +} __attribute__ ((packed)); + +/* cap/lease release record */ +struct ceph_mds_request_release { + __le64 ino, cap_id; /* ino and unique cap id */ + __le32 caps, wanted; /* new issued, wanted */ + __le32 seq, issue_seq, mseq; + __le32 dname_seq; /* if releasing a dentry lease, a */ + __le32 dname_len; /* string follows. */ +} __attribute__ ((packed)); + +/* client reply */ +struct ceph_mds_reply_head { + __le32 op; + __le32 result; + __le32 mdsmap_epoch; + __u8 safe; /* true if committed to disk */ + __u8 is_dentry, is_target; /* true if dentry, target inode records + are included with reply */ +} __attribute__ ((packed)); + +/* one for each node split */ +struct ceph_frag_tree_split { + __le32 frag; /* this frag splits... */ + __le32 by; /* ...by this many bits */ +} __attribute__ ((packed)); + +struct ceph_frag_tree_head { + __le32 nsplits; /* num ceph_frag_tree_split records */ + struct ceph_frag_tree_split splits[]; +} __attribute__ ((packed)); + +/* capability issue, for bundling with mds reply */ +struct ceph_mds_reply_cap { + __le32 caps, wanted; /* caps issued, wanted */ + __le64 cap_id; + __le32 seq, mseq; + __le64 realm; /* snap realm */ + __u8 flags; /* CEPH_CAP_FLAG_* */ +} __attribute__ ((packed)); + +#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */ + +/* inode record, for bundling with mds reply */ +struct ceph_mds_reply_inode { + __le64 ino; + __le64 snapid; + __le32 rdev; + __le64 version; /* inode version */ + __le64 xattr_version; /* version for xattr blob */ + struct ceph_mds_reply_cap cap; /* caps issued for this inode */ + struct ceph_file_layout layout; + struct ceph_timespec ctime, mtime, atime; + __le32 time_warp_seq; + __le64 size, max_size, truncate_size; + __le32 truncate_seq; + __le32 mode, uid, gid; + __le32 nlink; + __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */ + struct ceph_timespec rctime; + struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */ +} __attribute__ ((packed)); +/* followed by frag array, then symlink string, then xattr blob */ + +/* reply_lease follows dname, and reply_inode */ +struct ceph_mds_reply_lease { + __le16 mask; /* lease type(s) */ + __le32 duration_ms; /* lease duration */ + __le32 seq; +} __attribute__ ((packed)); + +struct ceph_mds_reply_dirfrag { + __le32 frag; /* fragment */ + __le32 auth; /* auth mds, if this is a delegation point */ + __le32 ndist; /* number of mds' this is replicated on */ + __le32 dist[]; +} __attribute__ ((packed)); + +/* file access modes */ +#define CEPH_FILE_MODE_PIN 0 +#define CEPH_FILE_MODE_RD 1 +#define CEPH_FILE_MODE_WR 2 +#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */ +#define CEPH_FILE_MODE_LAZY 4 /* lazy io */ +#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */ + +int ceph_flags_to_mode(int flags); + + +/* capability bits */ +#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */ + +/* generic cap bits */ +#define CEPH_CAP_GSHARED 1 /* client can reads */ +#define CEPH_CAP_GEXCL 2 /* client can read and update */ +#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */ +#define CEPH_CAP_GRD 8 /* (file) client can read */ +#define CEPH_CAP_GWR 16 /* (file) client can write */ +#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */ +#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */ +#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */ + +/* per-lock shift */ +#define CEPH_CAP_SAUTH 2 +#define CEPH_CAP_SLINK 4 +#define CEPH_CAP_SXATTR 6 +#define CEPH_CAP_SFILE 8 /* goes at the end (uses >2 cap bits) */ + +#define CEPH_CAP_BITS 16 + +/* composed values */ +#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH) +#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH) +#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK) +#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK) +#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR) +#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR) +#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE) + +/* cap masks (for getattr) */ +#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN +#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */ +#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN +#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED +#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED +#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED +#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED +#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED +#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED +#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED +#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */ +#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED +#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \ + CEPH_CAP_AUTH_SHARED | \ + CEPH_CAP_LINK_SHARED | \ + CEPH_CAP_FILE_SHARED | \ + CEPH_CAP_XATTR_SHARED) + +#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \ + CEPH_CAP_LINK_SHARED | \ + CEPH_CAP_XATTR_SHARED | \ + CEPH_CAP_FILE_SHARED) +#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \ + CEPH_CAP_FILE_CACHE) + +#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \ + CEPH_CAP_LINK_EXCL | \ + CEPH_CAP_XATTR_EXCL | \ + CEPH_CAP_FILE_EXCL) +#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \ + CEPH_CAP_FILE_EXCL) +#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR) +#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \ + CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN) + +#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \ + CEPH_LOCK_IXATTR) + +int ceph_caps_for_mode(int mode); + +enum { + CEPH_CAP_OP_GRANT, /* mds->client grant */ + CEPH_CAP_OP_REVOKE, /* mds->client revoke */ + CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */ + CEPH_CAP_OP_EXPORT, /* mds has exported the cap */ + CEPH_CAP_OP_IMPORT, /* mds has imported the cap */ + CEPH_CAP_OP_UPDATE, /* client->mds update */ + CEPH_CAP_OP_DROP, /* client->mds drop cap bits */ + CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */ + CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */ + CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */ + CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */ + CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */ + CEPH_CAP_OP_RENEW, /* client->mds renewal request */ +}; + +extern const char *ceph_cap_op_name(int op); + +/* + * caps message, used for capability callbacks, acks, requests, etc. + */ +struct ceph_mds_caps { + __le32 op; /* CEPH_CAP_OP_* */ + __le64 ino, realm; + __le64 cap_id; + __le32 seq, issue_seq; + __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */ + __le32 migrate_seq; + __le64 snap_follows; + __le32 snap_trace_len; + + /* authlock */ + __le32 uid, gid, mode; + + /* linklock */ + __le32 nlink; + + /* xattrlock */ + __le32 xattr_len; + __le64 xattr_version; + + /* filelock */ + __le64 size, max_size, truncate_size; + __le32 truncate_seq; + struct ceph_timespec mtime, atime, ctime; + struct ceph_file_layout layout; + __le32 time_warp_seq; +} __attribute__ ((packed)); + +/* cap release msg head */ +struct ceph_mds_cap_release { + __le32 num; /* number of cap_items that follow */ +} __attribute__ ((packed)); + +struct ceph_mds_cap_item { + __le64 ino; + __le64 cap_id; + __le32 migrate_seq, seq; +} __attribute__ ((packed)); + +#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */ +#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */ +#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */ +#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */ + +extern const char *ceph_lease_op_name(int o); + +/* lease msg header */ +struct ceph_mds_lease { + __u8 action; /* CEPH_MDS_LEASE_* */ + __le16 mask; /* which lease */ + __le64 ino; + __le64 first, last; /* snap range */ + __le32 seq; + __le32 duration_ms; /* duration of renewal */ +} __attribute__ ((packed)); +/* followed by a __le32+string for dname */ + +/* client reconnect */ +struct ceph_mds_cap_reconnect { + __le64 cap_id; + __le32 wanted; + __le32 issued; + __le64 size; + struct ceph_timespec mtime, atime; + __le64 snaprealm; + __le64 pathbase; /* base ino for our path to this ino */ +} __attribute__ ((packed)); +/* followed by encoded string */ + +struct ceph_mds_snaprealm_reconnect { + __le64 ino; /* snap realm base */ + __le64 seq; /* snap seq for this snap realm */ + __le64 parent; /* parent realm */ +} __attribute__ ((packed)); + +/* + * snaps + */ +enum { + CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */ + CEPH_SNAP_OP_CREATE, + CEPH_SNAP_OP_DESTROY, + CEPH_SNAP_OP_SPLIT, +}; + +extern const char *ceph_snap_op_name(int o); + +/* snap msg header */ +struct ceph_mds_snap_head { + __le32 op; /* CEPH_SNAP_OP_* */ + __le64 split; /* ino to split off, if any */ + __le32 num_split_inos; /* # inos belonging to new child realm */ + __le32 num_split_realms; /* # child realms udner new child realm */ + __le32 trace_len; /* size of snap trace blob */ +} __attribute__ ((packed)); +/* followed by split ino list, then split realms, then the trace blob */ + +/* + * encode info about a snaprealm, as viewed by a client + */ +struct ceph_mds_snap_realm { + __le64 ino; /* ino */ + __le64 created; /* snap: when created */ + __le64 parent; /* ino: parent realm */ + __le64 parent_since; /* snap: same parent since */ + __le64 seq; /* snap: version */ + __le32 num_snaps; + __le32 num_prior_parent_snaps; +} __attribute__ ((packed)); +/* followed by my snap list, then prior parent snap list */ + +#endif diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c new file mode 100644 index 00000000000..bd570015d14 --- /dev/null +++ b/fs/ceph/ceph_hash.c @@ -0,0 +1,118 @@ + +#include "types.h" + +/* + * Robert Jenkin's hash function. + * http://burtleburtle.net/bob/hash/evahash.html + * This is in the public domain. + */ +#define mix(a, b, c) \ + do { \ + a = a - b; a = a - c; a = a ^ (c >> 13); \ + b = b - c; b = b - a; b = b ^ (a << 8); \ + c = c - a; c = c - b; c = c ^ (b >> 13); \ + a = a - b; a = a - c; a = a ^ (c >> 12); \ + b = b - c; b = b - a; b = b ^ (a << 16); \ + c = c - a; c = c - b; c = c ^ (b >> 5); \ + a = a - b; a = a - c; a = a ^ (c >> 3); \ + b = b - c; b = b - a; b = b ^ (a << 10); \ + c = c - a; c = c - b; c = c ^ (b >> 15); \ + } while (0) + +unsigned ceph_str_hash_rjenkins(const char *str, unsigned length) +{ + const unsigned char *k = (const unsigned char *)str; + __u32 a, b, c; /* the internal state */ + __u32 len; /* how many key bytes still need mixing */ + + /* Set up the internal state */ + len = length; + a = 0x9e3779b9; /* the golden ratio; an arbitrary value */ + b = a; + c = 0; /* variable initialization of internal state */ + + /* handle most of the key */ + while (len >= 12) { + a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) + + ((__u32)k[3] << 24)); + b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) + + ((__u32)k[7] << 24)); + c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) + + ((__u32)k[11] << 24)); + mix(a, b, c); + k = k + 12; + len = len - 12; + } + + /* handle the last 11 bytes */ + c = c + length; + switch (len) { /* all the case statements fall through */ + case 11: + c = c + ((__u32)k[10] << 24); + case 10: + c = c + ((__u32)k[9] << 16); + case 9: + c = c + ((__u32)k[8] << 8); + /* the first byte of c is reserved for the length */ + case 8: + b = b + ((__u32)k[7] << 24); + case 7: + b = b + ((__u32)k[6] << 16); + case 6: + b = b + ((__u32)k[5] << 8); + case 5: + b = b + k[4]; + case 4: + a = a + ((__u32)k[3] << 24); + case 3: + a = a + ((__u32)k[2] << 16); + case 2: + a = a + ((__u32)k[1] << 8); + case 1: + a = a + k[0]; + /* case 0: nothing left to add */ + } + mix(a, b, c); + + return c; +} + +/* + * linux dcache hash + */ +unsigned ceph_str_hash_linux(const char *str, unsigned length) +{ + unsigned long hash = 0; + unsigned char c; + + while (length--) { + c = *str++; + hash = (hash + (c << 4) + (c >> 4)) * 11; + } + return hash; +} + + +unsigned ceph_str_hash(int type, const char *s, unsigned len) +{ + switch (type) { + case CEPH_STR_HASH_LINUX: + return ceph_str_hash_linux(s, len); + case CEPH_STR_HASH_RJENKINS: + return ceph_str_hash_rjenkins(s, len); + default: + return -1; + } +} + +const char *ceph_str_hash_name(int type) +{ + switch (type) { + case CEPH_STR_HASH_LINUX: + return "linux"; + case CEPH_STR_HASH_RJENKINS: + return "rjenkins"; + default: + return "unknown"; + } +} diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h new file mode 100644 index 00000000000..5ac470c433c --- /dev/null +++ b/fs/ceph/ceph_hash.h @@ -0,0 +1,13 @@ +#ifndef _FS_CEPH_HASH_H +#define _FS_CEPH_HASH_H + +#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */ +#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */ + +extern unsigned ceph_str_hash_linux(const char *s, unsigned len); +extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len); + +extern unsigned ceph_str_hash(int type, const char *s, unsigned len); +extern const char *ceph_str_hash_name(int type); + +#endif diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c new file mode 100644 index 00000000000..8e4be6a80c6 --- /dev/null +++ b/fs/ceph/ceph_strings.c @@ -0,0 +1,176 @@ +/* + * Ceph string constants + */ +#include "types.h" + +const char *ceph_entity_type_name(int type) +{ + switch (type) { + case CEPH_ENTITY_TYPE_MDS: return "mds"; + case CEPH_ENTITY_TYPE_OSD: return "osd"; + case CEPH_ENTITY_TYPE_MON: return "mon"; + case CEPH_ENTITY_TYPE_CLIENT: return "client"; + case CEPH_ENTITY_TYPE_ADMIN: return "admin"; + case CEPH_ENTITY_TYPE_AUTH: return "auth"; + default: return "unknown"; + } +} + +const char *ceph_osd_op_name(int op) +{ + switch (op) { + case CEPH_OSD_OP_READ: return "read"; + case CEPH_OSD_OP_STAT: return "stat"; + + case CEPH_OSD_OP_MASKTRUNC: return "masktrunc"; + + case CEPH_OSD_OP_WRITE: return "write"; + case CEPH_OSD_OP_DELETE: return "delete"; + case CEPH_OSD_OP_TRUNCATE: return "truncate"; + case CEPH_OSD_OP_ZERO: return "zero"; + case CEPH_OSD_OP_WRITEFULL: return "writefull"; + + case CEPH_OSD_OP_APPEND: return "append"; + case CEPH_OSD_OP_STARTSYNC: return "startsync"; + case CEPH_OSD_OP_SETTRUNC: return "settrunc"; + case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc"; + + case CEPH_OSD_OP_TMAPUP: return "tmapup"; + case CEPH_OSD_OP_TMAPGET: return "tmapget"; + case CEPH_OSD_OP_TMAPPUT: return "tmapput"; + + case CEPH_OSD_OP_GETXATTR: return "getxattr"; + case CEPH_OSD_OP_GETXATTRS: return "getxattrs"; + case CEPH_OSD_OP_SETXATTR: return "setxattr"; + case CEPH_OSD_OP_SETXATTRS: return "setxattrs"; + case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs"; + case CEPH_OSD_OP_RMXATTR: return "rmxattr"; + + case CEPH_OSD_OP_PULL: return "pull"; + case CEPH_OSD_OP_PUSH: return "push"; + case CEPH_OSD_OP_BALANCEREADS: return "balance-reads"; + case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads"; + case CEPH_OSD_OP_SCRUB: return "scrub"; + + case CEPH_OSD_OP_WRLOCK: return "wrlock"; + case CEPH_OSD_OP_WRUNLOCK: return "wrunlock"; + case CEPH_OSD_OP_RDLOCK: return "rdlock"; + case CEPH_OSD_OP_RDUNLOCK: return "rdunlock"; + case CEPH_OSD_OP_UPLOCK: return "uplock"; + case CEPH_OSD_OP_DNLOCK: return "dnlock"; + + case CEPH_OSD_OP_CALL: return "call"; + + case CEPH_OSD_OP_PGLS: return "pgls"; + } + return "???"; +} + +const char *ceph_mds_state_name(int s) +{ + switch (s) { + /* down and out */ + case CEPH_MDS_STATE_DNE: return "down:dne"; + case CEPH_MDS_STATE_STOPPED: return "down:stopped"; + /* up and out */ + case CEPH_MDS_STATE_BOOT: return "up:boot"; + case CEPH_MDS_STATE_STANDBY: return "up:standby"; + case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay"; + case CEPH_MDS_STATE_CREATING: return "up:creating"; + case CEPH_MDS_STATE_STARTING: return "up:starting"; + /* up and in */ + case CEPH_MDS_STATE_REPLAY: return "up:replay"; + case CEPH_MDS_STATE_RESOLVE: return "up:resolve"; + case CEPH_MDS_STATE_RECONNECT: return "up:reconnect"; + case CEPH_MDS_STATE_REJOIN: return "up:rejoin"; + case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay"; + case CEPH_MDS_STATE_ACTIVE: return "up:active"; + case CEPH_MDS_STATE_STOPPING: return "up:stopping"; + } + return "???"; +} + +const char *ceph_session_op_name(int op) +{ + switch (op) { + case CEPH_SESSION_REQUEST_OPEN: return "request_open"; + case CEPH_SESSION_OPEN: return "open"; + case CEPH_SESSION_REQUEST_CLOSE: return "request_close"; + case CEPH_SESSION_CLOSE: return "close"; + case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps"; + case CEPH_SESSION_RENEWCAPS: return "renewcaps"; + case CEPH_SESSION_STALE: return "stale"; + case CEPH_SESSION_RECALL_STATE: return "recall_state"; + } + return "???"; +} + +const char *ceph_mds_op_name(int op) +{ + switch (op) { + case CEPH_MDS_OP_LOOKUP: return "lookup"; + case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash"; + case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent"; + case CEPH_MDS_OP_GETATTR: return "getattr"; + case CEPH_MDS_OP_SETXATTR: return "setxattr"; + case CEPH_MDS_OP_SETATTR: return "setattr"; + case CEPH_MDS_OP_RMXATTR: return "rmxattr"; + case CEPH_MDS_OP_READDIR: return "readdir"; + case CEPH_MDS_OP_MKNOD: return "mknod"; + case CEPH_MDS_OP_LINK: return "link"; + case CEPH_MDS_OP_UNLINK: return "unlink"; + case CEPH_MDS_OP_RENAME: return "rename"; + case CEPH_MDS_OP_MKDIR: return "mkdir"; + case CEPH_MDS_OP_RMDIR: return "rmdir"; + case CEPH_MDS_OP_SYMLINK: return "symlink"; + case CEPH_MDS_OP_CREATE: return "create"; + case CEPH_MDS_OP_OPEN: return "open"; + case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap"; + case CEPH_MDS_OP_LSSNAP: return "lssnap"; + case CEPH_MDS_OP_MKSNAP: return "mksnap"; + case CEPH_MDS_OP_RMSNAP: return "rmsnap"; + } + return "???"; +} + +const char *ceph_cap_op_name(int op) +{ + switch (op) { + case CEPH_CAP_OP_GRANT: return "grant"; + case CEPH_CAP_OP_REVOKE: return "revoke"; + case CEPH_CAP_OP_TRUNC: return "trunc"; + case CEPH_CAP_OP_EXPORT: return "export"; + case CEPH_CAP_OP_IMPORT: return "import"; + case CEPH_CAP_OP_UPDATE: return "update"; + case CEPH_CAP_OP_DROP: return "drop"; + case CEPH_CAP_OP_FLUSH: return "flush"; + case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack"; + case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap"; + case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack"; + case CEPH_CAP_OP_RELEASE: return "release"; + case CEPH_CAP_OP_RENEW: return "renew"; + } + return "???"; +} + +const char *ceph_lease_op_name(int o) +{ + switch (o) { + case CEPH_MDS_LEASE_REVOKE: return "revoke"; + case CEPH_MDS_LEASE_RELEASE: return "release"; + case CEPH_MDS_LEASE_RENEW: return "renew"; + case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack"; + } + return "???"; +} + +const char *ceph_snap_op_name(int o) +{ + switch (o) { + case CEPH_SNAP_OP_UPDATE: return "update"; + case CEPH_SNAP_OP_CREATE: return "create"; + case CEPH_SNAP_OP_DESTROY: return "destroy"; + case CEPH_SNAP_OP_SPLIT: return "split"; + } + return "???"; +} diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c new file mode 100644 index 00000000000..fabd302e577 --- /dev/null +++ b/fs/ceph/crush/crush.c @@ -0,0 +1,151 @@ + +#ifdef __KERNEL__ +# include <linux/slab.h> +#else +# include <stdlib.h> +# include <assert.h> +# define kfree(x) do { if (x) free(x); } while (0) +# define BUG_ON(x) assert(!(x)) +#endif + +#include "crush.h" + +const char *crush_bucket_alg_name(int alg) +{ + switch (alg) { + case CRUSH_BUCKET_UNIFORM: return "uniform"; + case CRUSH_BUCKET_LIST: return "list"; + case CRUSH_BUCKET_TREE: return "tree"; + case CRUSH_BUCKET_STRAW: return "straw"; + default: return "unknown"; + } +} + +/** + * crush_get_bucket_item_weight - Get weight of an item in given bucket + * @b: bucket pointer + * @p: item index in bucket + */ +int crush_get_bucket_item_weight(struct crush_bucket *b, int p) +{ + if (p >= b->size) + return 0; + + switch (b->alg) { + case CRUSH_BUCKET_UNIFORM: + return ((struct crush_bucket_uniform *)b)->item_weight; + case CRUSH_BUCKET_LIST: + return ((struct crush_bucket_list *)b)->item_weights[p]; + case CRUSH_BUCKET_TREE: + if (p & 1) + return ((struct crush_bucket_tree *)b)->node_weights[p]; + return 0; + case CRUSH_BUCKET_STRAW: + return ((struct crush_bucket_straw *)b)->item_weights[p]; + } + return 0; +} + +/** + * crush_calc_parents - Calculate parent vectors for the given crush map. + * @map: crush_map pointer + */ +void crush_calc_parents(struct crush_map *map) +{ + int i, b, c; + + for (b = 0; b < map->max_buckets; b++) { + if (map->buckets[b] == NULL) + continue; + for (i = 0; i < map->buckets[b]->size; i++) { + c = map->buckets[b]->items[i]; + BUG_ON(c >= map->max_devices || + c < -map->max_buckets); + if (c >= 0) + map->device_parents[c] = map->buckets[b]->id; + else + map->bucket_parents[-1-c] = map->buckets[b]->id; + } + } +} + +void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) +{ + kfree(b->h.perm); + kfree(b->h.items); + kfree(b); +} + +void crush_destroy_bucket_list(struct crush_bucket_list *b) +{ + kfree(b->item_weights); + kfree(b->sum_weights); + kfree(b->h.perm); + kfree(b->h.items); + kfree(b); +} + +void crush_destroy_bucket_tree(struct crush_bucket_tree *b) +{ + kfree(b->node_weights); + kfree(b); +} + +void crush_destroy_bucket_straw(struct crush_bucket_straw *b) +{ + kfree(b->straws); + kfree(b->item_weights); + kfree(b->h.perm); + kfree(b->h.items); + kfree(b); +} + +void crush_destroy_bucket(struct crush_bucket *b) +{ + switch (b->alg) { + case CRUSH_BUCKET_UNIFORM: + crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b); + break; + case CRUSH_BUCKET_LIST: + crush_destroy_bucket_list((struct crush_bucket_list *)b); + break; + case CRUSH_BUCKET_TREE: + crush_destroy_bucket_tree((struct crush_bucket_tree *)b); + break; + case CRUSH_BUCKET_STRAW: + crush_destroy_bucket_straw((struct crush_bucket_straw *)b); + break; + } +} + +/** + * crush_destroy - Destroy a crush_map + * @map: crush_map pointer + */ +void crush_destroy(struct crush_map *map) +{ + int b; + + /* buckets */ + if (map->buckets) { + for (b = 0; b < map->max_buckets; b++) { + if (map->buckets[b] == NULL) + continue; + crush_destroy_bucket(map->buckets[b]); + } + kfree(map->buckets); + } + + /* rules */ + if (map->rules) { + for (b = 0; b < map->max_rules; b++) + kfree(map->rules[b]); + kfree(map->rules); + } + + kfree(map->bucket_parents); + kfree(map->device_parents); + kfree(map); +} + + diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h new file mode 100644 index 00000000000..dcd7e752370 --- /dev/null +++ b/fs/ceph/crush/crush.h @@ -0,0 +1,180 @@ +#ifndef _CRUSH_CRUSH_H +#define _CRUSH_CRUSH_H + +#include <linux/types.h> + +/* + * CRUSH is a pseudo-random data distribution algorithm that + * efficiently distributes input values (typically, data objects) + * across a heterogeneous, structured storage cluster. + * + * The algorithm was originally described in detail in this paper + * (although the algorithm has evolved somewhat since then): + * + * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf + * + * LGPL2 + */ + + +#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */ + + +#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */ +#define CRUSH_MAX_SET 10 /* max size of a mapping result */ + + +/* + * CRUSH uses user-defined "rules" to describe how inputs should be + * mapped to devices. A rule consists of sequence of steps to perform + * to generate the set of output devices. + */ +struct crush_rule_step { + __u32 op; + __s32 arg1; + __s32 arg2; +}; + +/* step op codes */ +enum { + CRUSH_RULE_NOOP = 0, + CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */ + CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */ + /* arg2 = type */ + CRUSH_RULE_CHOOSE_INDEP = 3, /* same */ + CRUSH_RULE_EMIT = 4, /* no args */ + CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6, + CRUSH_RULE_CHOOSE_LEAF_INDEP = 7, +}; + +/* + * for specifying choose num (arg1) relative to the max parameter + * passed to do_rule + */ +#define CRUSH_CHOOSE_N 0 +#define CRUSH_CHOOSE_N_MINUS(x) (-(x)) + +/* + * The rule mask is used to describe what the rule is intended for. + * Given a ruleset and size of output set, we search through the + * rule list for a matching rule_mask. + */ +struct crush_rule_mask { + __u8 ruleset; + __u8 type; + __u8 min_size; + __u8 max_size; +}; + +struct crush_rule { + __u32 len; + struct crush_rule_mask mask; + struct crush_rule_step steps[0]; +}; + +#define crush_rule_size(len) (sizeof(struct crush_rule) + \ + (len)*sizeof(struct crush_rule_step)) + + + +/* + * A bucket is a named container of other items (either devices or + * other buckets). Items within a bucket are chosen using one of a + * few different algorithms. The table summarizes how the speed of + * each option measures up against mapping stability when items are + * added or removed. + * + * Bucket Alg Speed Additions Removals + * ------------------------------------------------ + * uniform O(1) poor poor + * list O(n) optimal poor + * tree O(log n) good good + * straw O(n) optimal optimal + */ +enum { + CRUSH_BUCKET_UNIFORM = 1, + CRUSH_BUCKET_LIST = 2, + CRUSH_BUCKET_TREE = 3, + CRUSH_BUCKET_STRAW = 4 +}; +extern const char *crush_bucket_alg_name(int alg); + +struct crush_bucket { + __s32 id; /* this'll be negative */ + __u16 type; /* non-zero; type=0 is reserved for devices */ + __u8 alg; /* one of CRUSH_BUCKET_* */ + __u8 hash; /* which hash function to use, CRUSH_HASH_* */ + __u32 weight; /* 16-bit fixed point */ + __u32 size; /* num items */ + __s32 *items; + + /* + * cached random permutation: used for uniform bucket and for + * the linear search fallback for the other bucket types. + */ + __u32 perm_x; /* @x for which *perm is defined */ + __u32 perm_n; /* num elements of *perm that are permuted/defined */ + __u32 *perm; +}; + +struct crush_bucket_uniform { + struct crush_bucket h; + __u32 item_weight; /* 16-bit fixed point; all items equally weighted */ +}; + +struct crush_bucket_list { + struct crush_bucket h; + __u32 *item_weights; /* 16-bit fixed point */ + __u32 *sum_weights; /* 16-bit fixed point. element i is sum + of weights 0..i, inclusive */ +}; + +struct crush_bucket_tree { + struct crush_bucket h; /* note: h.size is _tree_ size, not number of + actual items */ + __u8 num_nodes; + __u32 *node_weights; +}; + +struct crush_bucket_straw { + struct crush_bucket h; + __u32 *item_weights; /* 16-bit fixed point */ + __u32 *straws; /* 16-bit fixed point */ +}; + + + +/* + * CRUSH map includes all buckets, rules, etc. + */ +struct crush_map { + struct crush_bucket **buckets; + struct crush_rule **rules; + + /* + * Parent pointers to identify the parent bucket a device or + * bucket in the hierarchy. If an item appears more than + * once, this is the _last_ time it appeared (where buckets + * are processed in bucket id order, from -1 on down to + * -max_buckets. + */ + __u32 *bucket_parents; + __u32 *device_parents; + + __s32 max_buckets; + __u32 max_rules; + __s32 max_devices; +}; + + +/* crush.c */ +extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos); +extern void crush_calc_parents(struct crush_map *map); +extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b); +extern void crush_destroy_bucket_list(struct crush_bucket_list *b); +extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b); +extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b); +extern void crush_destroy_bucket(struct crush_bucket *b); +extern void crush_destroy(struct crush_map *map); + +#endif diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c new file mode 100644 index 00000000000..5873aed694b --- /dev/null +++ b/fs/ceph/crush/hash.c @@ -0,0 +1,149 @@ + +#include <linux/types.h> +#include "hash.h" + +/* + * Robert Jenkins' function for mixing 32-bit values + * http://burtleburtle.net/bob/hash/evahash.html + * a, b = random bits, c = input and output + */ +#define crush_hashmix(a, b, c) do { \ + a = a-b; a = a-c; a = a^(c>>13); \ + b = b-c; b = b-a; b = b^(a<<8); \ + c = c-a; c = c-b; c = c^(b>>13); \ + a = a-b; a = a-c; a = a^(c>>12); \ + b = b-c; b = b-a; b = b^(a<<16); \ + c = c-a; c = c-b; c = c^(b>>5); \ + a = a-b; a = a-c; a = a^(c>>3); \ + b = b-c; b = b-a; b = b^(a<<10); \ + c = c-a; c = c-b; c = c^(b>>15); \ + } while (0) + +#define crush_hash_seed 1315423911 + +static __u32 crush_hash32_rjenkins1(__u32 a) +{ + __u32 hash = crush_hash_seed ^ a; + __u32 b = a; + __u32 x = 231232; + __u32 y = 1232; + crush_hashmix(b, x, hash); + crush_hashmix(y, a, hash); + return hash; +} + +static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b) +{ + __u32 hash = crush_hash_seed ^ a ^ b; + __u32 x = 231232; + __u32 y = 1232; + crush_hashmix(a, b, hash); + crush_hashmix(x, a, hash); + crush_hashmix(b, y, hash); + return hash; +} + +static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c) +{ + __u32 hash = crush_hash_seed ^ a ^ b ^ c; + __u32 x = 231232; + __u32 y = 1232; + crush_hashmix(a, b, hash); + crush_hashmix(c, x, hash); + crush_hashmix(y, a, hash); + crush_hashmix(b, x, hash); + crush_hashmix(y, c, hash); + return hash; +} + +static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d) +{ + __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d; + __u32 x = 231232; + __u32 y = 1232; + crush_hashmix(a, b, hash); + crush_hashmix(c, d, hash); + crush_hashmix(a, x, hash); + crush_hashmix(y, b, hash); + crush_hashmix(c, x, hash); + crush_hashmix(y, d, hash); + return hash; +} + +static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d, + __u32 e) +{ + __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e; + __u32 x = 231232; + __u32 y = 1232; + crush_hashmix(a, b, hash); + crush_hashmix(c, d, hash); + crush_hashmix(e, x, hash); + crush_hashmix(y, a, hash); + crush_hashmix(b, x, hash); + crush_hashmix(y, c, hash); + crush_hashmix(d, x, hash); + crush_hashmix(y, e, hash); + return hash; +} + + +__u32 crush_hash32(int type, __u32 a) +{ + switch (type) { + case CRUSH_HASH_RJENKINS1: + return crush_hash32_rjenkins1(a); + default: + return 0; + } +} + +__u32 crush_hash32_2(int type, __u32 a, __u32 b) +{ + switch (type) { + case CRUSH_HASH_RJENKINS1: + return crush_hash32_rjenkins1_2(a, b); + default: + return 0; + } +} + +__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c) +{ + switch (type) { + case CRUSH_HASH_RJENKINS1: + return crush_hash32_rjenkins1_3(a, b, c); + default: + return 0; + } +} + +__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d) +{ + switch (type) { + case CRUSH_HASH_RJENKINS1: + return crush_hash32_rjenkins1_4(a, b, c, d); + default: + return 0; + } +} + +__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e) +{ + switch (type) { + case CRUSH_HASH_RJENKINS1: + return crush_hash32_rjenkins1_5(a, b, c, d, e); + default: + return 0; + } +} + +const char *crush_hash_name(int type) +{ + switch (type) { + case CRUSH_HASH_RJENKINS1: + return "rjenkins1"; + default: + return "unknown"; + } +} diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h new file mode 100644 index 00000000000..ff48e110e4b --- /dev/null +++ b/fs/ceph/crush/hash.h @@ -0,0 +1,17 @@ +#ifndef _CRUSH_HASH_H +#define _CRUSH_HASH_H + +#define CRUSH_HASH_RJENKINS1 0 + +#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1 + +extern const char *crush_hash_name(int type); + +extern __u32 crush_hash32(int type, __u32 a); +extern __u32 crush_hash32_2(int type, __u32 a, __u32 b); +extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c); +extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d); +extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, + __u32 e); + +#endif diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c new file mode 100644 index 00000000000..9ba54efb654 --- /dev/null +++ b/fs/ceph/crush/mapper.c @@ -0,0 +1,596 @@ + +#ifdef __KERNEL__ +# include <linux/string.h> +# include <linux/slab.h> +# include <linux/bug.h> +# include <linux/kernel.h> +# ifndef dprintk +# define dprintk(args...) +# endif +#else +# include <string.h> +# include <stdio.h> +# include <stdlib.h> +# include <assert.h> +# define BUG_ON(x) assert(!(x)) +# define dprintk(args...) /* printf(args) */ +# define kmalloc(x, f) malloc(x) +# define kfree(x) free(x) +#endif + +#include "crush.h" +#include "hash.h" + +/* + * Implement the core CRUSH mapping algorithm. + */ + +/** + * crush_find_rule - find a crush_rule id for a given ruleset, type, and size. + * @map: the crush_map + * @ruleset: the storage ruleset id (user defined) + * @type: storage ruleset type (user defined) + * @size: output set size + */ +int crush_find_rule(struct crush_map *map, int ruleset, int type, int size) +{ + int i; + + for (i = 0; i < map->max_rules; i++) { + if (map->rules[i] && + map->rules[i]->mask.ruleset == ruleset && + map->rules[i]->mask.type == type && + map->rules[i]->mask.min_size <= size && + map->rules[i]->mask.max_size >= size) + return i; + } + return -1; +} + + +/* + * bucket choose methods + * + * For each bucket algorithm, we have a "choose" method that, given a + * crush input @x and replica position (usually, position in output set) @r, + * will produce an item in the bucket. + */ + +/* + * Choose based on a random permutation of the bucket. + * + * We used to use some prime number arithmetic to do this, but it + * wasn't very random, and had some other bad behaviors. Instead, we + * calculate an actual random permutation of the bucket members. + * Since this is expensive, we optimize for the r=0 case, which + * captures the vast majority of calls. + */ +static int bucket_perm_choose(struct crush_bucket *bucket, + int x, int r) +{ + unsigned pr = r % bucket->size; + unsigned i, s; + + /* start a new permutation if @x has changed */ + if (bucket->perm_x != x || bucket->perm_n == 0) { + dprintk("bucket %d new x=%d\n", bucket->id, x); + bucket->perm_x = x; + + /* optimize common r=0 case */ + if (pr == 0) { + s = crush_hash32_3(bucket->hash, x, bucket->id, 0) % + bucket->size; + bucket->perm[0] = s; + bucket->perm_n = 0xffff; /* magic value, see below */ + goto out; + } + + for (i = 0; i < bucket->size; i++) + bucket->perm[i] = i; + bucket->perm_n = 0; + } else if (bucket->perm_n == 0xffff) { + /* clean up after the r=0 case above */ + for (i = 1; i < bucket->size; i++) + bucket->perm[i] = i; + bucket->perm[bucket->perm[0]] = 0; + bucket->perm_n = 1; + } + + /* calculate permutation up to pr */ + for (i = 0; i < bucket->perm_n; i++) + dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]); + while (bucket->perm_n <= pr) { + unsigned p = bucket->perm_n; + /* no point in swapping the final entry */ + if (p < bucket->size - 1) { + i = crush_hash32_3(bucket->hash, x, bucket->id, p) % + (bucket->size - p); + if (i) { + unsigned t = bucket->perm[p + i]; + bucket->perm[p + i] = bucket->perm[p]; + bucket->perm[p] = t; + } + dprintk(" perm_choose swap %d with %d\n", p, p+i); + } + bucket->perm_n++; + } + for (i = 0; i < bucket->size; i++) + dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]); + + s = bucket->perm[pr]; +out: + dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id, + bucket->size, x, r, pr, s); + return bucket->items[s]; +} + +/* uniform */ +static int bucket_uniform_choose(struct crush_bucket_uniform *bucket, + int x, int r) +{ + return bucket_perm_choose(&bucket->h, x, r); +} + +/* list */ +static int bucket_list_choose(struct crush_bucket_list *bucket, + int x, int r) +{ + int i; + + for (i = bucket->h.size-1; i >= 0; i--) { + __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i], + r, bucket->h.id); + w &= 0xffff; + dprintk("list_choose i=%d x=%d r=%d item %d weight %x " + "sw %x rand %llx", + i, x, r, bucket->h.items[i], bucket->item_weights[i], + bucket->sum_weights[i], w); + w *= bucket->sum_weights[i]; + w = w >> 16; + /*dprintk(" scaled %llx\n", w);*/ + if (w < bucket->item_weights[i]) + return bucket->h.items[i]; + } + + BUG_ON(1); + return 0; +} + + +/* (binary) tree */ +static int height(int n) +{ + int h = 0; + while ((n & 1) == 0) { + h++; + n = n >> 1; + } + return h; +} + +static int left(int x) +{ + int h = height(x); + return x - (1 << (h-1)); +} + +static int right(int x) +{ + int h = height(x); + return x + (1 << (h-1)); +} + +static int terminal(int x) +{ + return x & 1; +} + +static int bucket_tree_choose(struct crush_bucket_tree *bucket, + int x, int r) +{ + int n, l; + __u32 w; + __u64 t; + + /* start at root */ + n = bucket->num_nodes >> 1; + + while (!terminal(n)) { + /* pick point in [0, w) */ + w = bucket->node_weights[n]; + t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r, + bucket->h.id) * (__u64)w; + t = t >> 32; + + /* descend to the left or right? */ + l = left(n); + if (t < bucket->node_weights[l]) + n = l; + else + n = right(n); + } + + return bucket->h.items[n >> 1]; +} + + +/* straw */ + +static int bucket_straw_choose(struct crush_bucket_straw *bucket, + int x, int r) +{ + int i; + int high = 0; + __u64 high_draw = 0; + __u64 draw; + + for (i = 0; i < bucket->h.size; i++) { + draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r); + draw &= 0xffff; + draw *= bucket->straws[i]; + if (i == 0 || draw > high_draw) { + high = i; + high_draw = draw; + } + } + return bucket->h.items[high]; +} + +static int crush_bucket_choose(struct crush_bucket *in, int x, int r) +{ + dprintk("choose %d x=%d r=%d\n", in->id, x, r); + switch (in->alg) { + case CRUSH_BUCKET_UNIFORM: + return bucket_uniform_choose((struct crush_bucket_uniform *)in, + x, r); + case CRUSH_BUCKET_LIST: + return bucket_list_choose((struct crush_bucket_list *)in, + x, r); + case CRUSH_BUCKET_TREE: + return bucket_tree_choose((struct crush_bucket_tree *)in, + x, r); + case CRUSH_BUCKET_STRAW: + return bucket_straw_choose((struct crush_bucket_straw *)in, + x, r); + default: + BUG_ON(1); + return in->items[0]; + } +} + +/* + * true if device is marked "out" (failed, fully offloaded) + * of the cluster + */ +static int is_out(struct crush_map *map, __u32 *weight, int item, int x) +{ + if (weight[item] >= 0x1000) + return 0; + if (weight[item] == 0) + return 1; + if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff) + < weight[item]) + return 0; + return 1; +} + +/** + * crush_choose - choose numrep distinct items of given type + * @map: the crush_map + * @bucket: the bucket we are choose an item from + * @x: crush input value + * @numrep: the number of items to choose + * @type: the type of item to choose + * @out: pointer to output vector + * @outpos: our position in that vector + * @firstn: true if choosing "first n" items, false if choosing "indep" + * @recurse_to_leaf: true if we want one device under each item of given type + * @out2: second output vector for leaf items (if @recurse_to_leaf) + */ +static int crush_choose(struct crush_map *map, + struct crush_bucket *bucket, + __u32 *weight, + int x, int numrep, int type, + int *out, int outpos, + int firstn, int recurse_to_leaf, + int *out2) +{ + int rep; + int ftotal, flocal; + int retry_descent, retry_bucket, skip_rep; + struct crush_bucket *in = bucket; + int r; + int i; + int item = 0; + int itemtype; + int collide, reject; + const int orig_tries = 5; /* attempts before we fall back to search */ + dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos); + + for (rep = outpos; rep < numrep; rep++) { + /* keep trying until we get a non-out, non-colliding item */ + ftotal = 0; + skip_rep = 0; + do { + retry_descent = 0; + in = bucket; /* initial bucket */ + + /* choose through intervening buckets */ + flocal = 0; + do { + collide = 0; + retry_bucket = 0; + r = rep; + if (in->alg == CRUSH_BUCKET_UNIFORM) { + /* be careful */ + if (firstn || numrep >= in->size) + /* r' = r + f_total */ + r += ftotal; + else if (in->size % numrep == 0) + /* r'=r+(n+1)*f_local */ + r += (numrep+1) * + (flocal+ftotal); + else + /* r' = r + n*f_local */ + r += numrep * (flocal+ftotal); + } else { + if (firstn) + /* r' = r + f_total */ + r += ftotal; + else + /* r' = r + n*f_local */ + r += numrep * (flocal+ftotal); + } + + /* bucket choose */ + if (in->size == 0) { + reject = 1; + goto reject; + } + if (flocal >= (in->size>>1) && + flocal > orig_tries) + item = bucket_perm_choose(in, x, r); + else + item = crush_bucket_choose(in, x, r); + BUG_ON(item >= map->max_devices); + + /* desired type? */ + if (item < 0) + itemtype = map->buckets[-1-item]->type; + else + itemtype = 0; + dprintk(" item %d type %d\n", item, itemtype); + + /* keep going? */ + if (itemtype != type) { + BUG_ON(item >= 0 || + (-1-item) >= map->max_buckets); + in = map->buckets[-1-item]; + continue; + } + + /* collision? */ + for (i = 0; i < outpos; i++) { + if (out[i] == item) { + collide = 1; + break; + } + } + + if (recurse_to_leaf && + item < 0 && + crush_choose(map, map->buckets[-1-item], + weight, + x, outpos+1, 0, + out2, outpos, + firstn, 0, NULL) <= outpos) { + reject = 1; + } else { + /* out? */ + if (itemtype == 0) + reject = is_out(map, weight, + item, x); + else + reject = 0; + } + +reject: + if (reject || collide) { + ftotal++; + flocal++; + + if (collide && flocal < 3) + /* retry locally a few times */ + retry_bucket = 1; + else if (flocal < in->size + orig_tries) + /* exhaustive bucket search */ + retry_bucket = 1; + else if (ftotal < 20) + /* then retry descent */ + retry_descent = 1; + else + /* else give up */ + skip_rep = 1; + dprintk(" reject %d collide %d " + "ftotal %d flocal %d\n", + reject, collide, ftotal, + flocal); + } + } while (retry_bucket); + } while (retry_descent); + + if (skip_rep) { + dprintk("skip rep\n"); + continue; + } + + dprintk("choose got %d\n", item); + out[outpos] = item; + outpos++; + } + + dprintk("choose returns %d\n", outpos); + return outpos; +} + + +/** + * crush_do_rule - calculate a mapping with the given input and rule + * @map: the crush_map + * @ruleno: the rule id + * @x: hash input + * @result: pointer to result vector + * @result_max: maximum result size + * @force: force initial replica choice; -1 for none + */ +int crush_do_rule(struct crush_map *map, + int ruleno, int x, int *result, int result_max, + int force, __u32 *weight) +{ + int result_len; + int force_context[CRUSH_MAX_DEPTH]; + int force_pos = -1; + int a[CRUSH_MAX_SET]; + int b[CRUSH_MAX_SET]; + int c[CRUSH_MAX_SET]; + int recurse_to_leaf; + int *w; + int wsize = 0; + int *o; + int osize; + int *tmp; + struct crush_rule *rule; + int step; + int i, j; + int numrep; + int firstn; + int rc = -1; + + BUG_ON(ruleno >= map->max_rules); + + rule = map->rules[ruleno]; + result_len = 0; + w = a; + o = b; + + /* + * determine hierarchical context of force, if any. note + * that this may or may not correspond to the specific types + * referenced by the crush rule. + */ + if (force >= 0) { + if (force >= map->max_devices || + map->device_parents[force] == 0) { + /*dprintk("CRUSH: forcefed device dne\n");*/ + rc = -1; /* force fed device dne */ + goto out; + } + if (!is_out(map, weight, force, x)) { + while (1) { + force_context[++force_pos] = force; + if (force >= 0) + force = map->device_parents[force]; + else + force = map->bucket_parents[-1-force]; + if (force == 0) + break; + } + } + } + + for (step = 0; step < rule->len; step++) { + firstn = 0; + switch (rule->steps[step].op) { + case CRUSH_RULE_TAKE: + w[0] = rule->steps[step].arg1; + if (force_pos >= 0) { + BUG_ON(force_context[force_pos] != w[0]); + force_pos--; + } + wsize = 1; + break; + + case CRUSH_RULE_CHOOSE_LEAF_FIRSTN: + case CRUSH_RULE_CHOOSE_FIRSTN: + firstn = 1; + case CRUSH_RULE_CHOOSE_LEAF_INDEP: + case CRUSH_RULE_CHOOSE_INDEP: + BUG_ON(wsize == 0); + + recurse_to_leaf = + rule->steps[step].op == + CRUSH_RULE_CHOOSE_LEAF_FIRSTN || + rule->steps[step].op == + CRUSH_RULE_CHOOSE_LEAF_INDEP; + + /* reset output */ + osize = 0; + + for (i = 0; i < wsize; i++) { + /* + * see CRUSH_N, CRUSH_N_MINUS macros. + * basically, numrep <= 0 means relative to + * the provided result_max + */ + numrep = rule->steps[step].arg1; + if (numrep <= 0) { + numrep += result_max; + if (numrep <= 0) + continue; + } + j = 0; + if (osize == 0 && force_pos >= 0) { + /* skip any intermediate types */ + while (force_pos && + force_context[force_pos] < 0 && + rule->steps[step].arg2 != + map->buckets[-1 - + force_context[force_pos]]->type) + force_pos--; + o[osize] = force_context[force_pos]; + if (recurse_to_leaf) + c[osize] = force_context[0]; + j++; + force_pos--; + } + osize += crush_choose(map, + map->buckets[-1-w[i]], + weight, + x, numrep, + rule->steps[step].arg2, + o+osize, j, + firstn, + recurse_to_leaf, c+osize); + } + + if (recurse_to_leaf) + /* copy final _leaf_ values to output set */ + memcpy(o, c, osize*sizeof(*o)); + + /* swap t and w arrays */ + tmp = o; + o = w; + w = tmp; + wsize = osize; + break; + + + case CRUSH_RULE_EMIT: + for (i = 0; i < wsize && result_len < result_max; i++) { + result[result_len] = w[i]; + result_len++; + } + wsize = 0; + break; + + default: + BUG_ON(1); + } + } + rc = result_len; + +out: + return rc; +} + + diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h new file mode 100644 index 00000000000..98e90046fd9 --- /dev/null +++ b/fs/ceph/crush/mapper.h @@ -0,0 +1,20 @@ +#ifndef _CRUSH_MAPPER_H +#define _CRUSH_MAPPER_H + +/* + * CRUSH functions for find rules and then mapping an input to an + * output set. + * + * LGPL2 + */ + +#include "crush.h" + +extern int crush_find_rule(struct crush_map *map, int pool, int type, int size); +extern int crush_do_rule(struct crush_map *map, + int ruleno, + int x, int *result, int result_max, + int forcefeed, /* -1 for none */ + __u32 *weights); + +#endif diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c new file mode 100644 index 00000000000..291ac288e79 --- /dev/null +++ b/fs/ceph/crypto.c @@ -0,0 +1,408 @@ + +#include "ceph_debug.h" + +#include <linux/err.h> +#include <linux/scatterlist.h> +#include <crypto/hash.h> + +#include "crypto.h" +#include "decode.h" + +int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end) +{ + if (*p + sizeof(u16) + sizeof(key->created) + + sizeof(u16) + key->len > end) + return -ERANGE; + ceph_encode_16(p, key->type); + ceph_encode_copy(p, &key->created, sizeof(key->created)); + ceph_encode_16(p, key->len); + ceph_encode_copy(p, key->key, key->len); + return 0; +} + +int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end) +{ + ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad); + key->type = ceph_decode_16(p); + ceph_decode_copy(p, &key->created, sizeof(key->created)); + key->len = ceph_decode_16(p); + ceph_decode_need(p, end, key->len, bad); + key->key = kmalloc(key->len, GFP_NOFS); + if (!key->key) + return -ENOMEM; + ceph_decode_copy(p, key->key, key->len); + return 0; + +bad: + dout("failed to decode crypto key\n"); + return -EINVAL; +} + +int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey) +{ + int inlen = strlen(inkey); + int blen = inlen * 3 / 4; + void *buf, *p; + int ret; + + dout("crypto_key_unarmor %s\n", inkey); + buf = kmalloc(blen, GFP_NOFS); + if (!buf) + return -ENOMEM; + blen = ceph_unarmor(buf, inkey, inkey+inlen); + if (blen < 0) { + kfree(buf); + return blen; + } + + p = buf; + ret = ceph_crypto_key_decode(key, &p, p + blen); + kfree(buf); + if (ret) + return ret; + dout("crypto_key_unarmor key %p type %d len %d\n", key, + key->type, key->len); + return 0; +} + + + +#define AES_KEY_SIZE 16 + +static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void) +{ + return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC); +} + +const u8 *aes_iv = "cephsageyudagreg"; + +int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len, + const void *src, size_t src_len) +{ + struct scatterlist sg_in[2], sg_out[1]; + struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); + struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 }; + int ret; + void *iv; + int ivsize; + size_t zero_padding = (0x10 - (src_len & 0x0f)); + char pad[16]; + + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + memset(pad, zero_padding, zero_padding); + + *dst_len = src_len + zero_padding; + + crypto_blkcipher_setkey((void *)tfm, key, key_len); + sg_init_table(sg_in, 2); + sg_set_buf(&sg_in[0], src, src_len); + sg_set_buf(&sg_in[1], pad, zero_padding); + sg_init_table(sg_out, 1); + sg_set_buf(sg_out, dst, *dst_len); + iv = crypto_blkcipher_crt(tfm)->iv; + ivsize = crypto_blkcipher_ivsize(tfm); + + memcpy(iv, aes_iv, ivsize); + /* + print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1, + key, key_len, 1); + print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1, + src, src_len, 1); + print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1, + pad, zero_padding, 1); + */ + ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in, + src_len + zero_padding); + crypto_free_blkcipher(tfm); + if (ret < 0) + pr_err("ceph_aes_crypt failed %d\n", ret); + /* + print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1, + dst, *dst_len, 1); + */ + return 0; +} + +int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len, + const void *src1, size_t src1_len, + const void *src2, size_t src2_len) +{ + struct scatterlist sg_in[3], sg_out[1]; + struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); + struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 }; + int ret; + void *iv; + int ivsize; + size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f)); + char pad[16]; + + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + memset(pad, zero_padding, zero_padding); + + *dst_len = src1_len + src2_len + zero_padding; + + crypto_blkcipher_setkey((void *)tfm, key, key_len); + sg_init_table(sg_in, 3); + sg_set_buf(&sg_in[0], src1, src1_len); + sg_set_buf(&sg_in[1], src2, src2_len); + sg_set_buf(&sg_in[2], pad, zero_padding); + sg_init_table(sg_out, 1); + sg_set_buf(sg_out, dst, *dst_len); + iv = crypto_blkcipher_crt(tfm)->iv; + ivsize = crypto_blkcipher_ivsize(tfm); + + memcpy(iv, aes_iv, ivsize); + /* + print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1, + key, key_len, 1); + print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1, + src1, src1_len, 1); + print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1, + src2, src2_len, 1); + print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1, + pad, zero_padding, 1); + */ + ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in, + src1_len + src2_len + zero_padding); + crypto_free_blkcipher(tfm); + if (ret < 0) + pr_err("ceph_aes_crypt2 failed %d\n", ret); + /* + print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1, + dst, *dst_len, 1); + */ + return 0; +} + +int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len, + const void *src, size_t src_len) +{ + struct scatterlist sg_in[1], sg_out[2]; + struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); + struct blkcipher_desc desc = { .tfm = tfm }; + char pad[16]; + void *iv; + int ivsize; + int ret; + int last_byte; + + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + crypto_blkcipher_setkey((void *)tfm, key, key_len); + sg_init_table(sg_in, 1); + sg_init_table(sg_out, 2); + sg_set_buf(sg_in, src, src_len); + sg_set_buf(&sg_out[0], dst, *dst_len); + sg_set_buf(&sg_out[1], pad, sizeof(pad)); + + iv = crypto_blkcipher_crt(tfm)->iv; + ivsize = crypto_blkcipher_ivsize(tfm); + + memcpy(iv, aes_iv, ivsize); + + /* + print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1, + key, key_len, 1); + print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1, + src, src_len, 1); + */ + + ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len); + crypto_free_blkcipher(tfm); + if (ret < 0) { + pr_err("ceph_aes_decrypt failed %d\n", ret); + return ret; + } + + if (src_len <= *dst_len) + last_byte = ((char *)dst)[src_len - 1]; + else + last_byte = pad[src_len - *dst_len - 1]; + if (last_byte <= 16 && src_len >= last_byte) { + *dst_len = src_len - last_byte; + } else { + pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n", + last_byte, (int)src_len); + return -EPERM; /* bad padding */ + } + /* + print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1, + dst, *dst_len, 1); + */ + return 0; +} + +int ceph_aes_decrypt2(const void *key, int key_len, + void *dst1, size_t *dst1_len, + void *dst2, size_t *dst2_len, + const void *src, size_t src_len) +{ + struct scatterlist sg_in[1], sg_out[3]; + struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); + struct blkcipher_desc desc = { .tfm = tfm }; + char pad[16]; + void *iv; + int ivsize; + int ret; + int last_byte; + + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + sg_init_table(sg_in, 1); + sg_set_buf(sg_in, src, src_len); + sg_init_table(sg_out, 3); + sg_set_buf(&sg_out[0], dst1, *dst1_len); + sg_set_buf(&sg_out[1], dst2, *dst2_len); + sg_set_buf(&sg_out[2], pad, sizeof(pad)); + + crypto_blkcipher_setkey((void *)tfm, key, key_len); + iv = crypto_blkcipher_crt(tfm)->iv; + ivsize = crypto_blkcipher_ivsize(tfm); + + memcpy(iv, aes_iv, ivsize); + + /* + print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1, + key, key_len, 1); + print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1, + src, src_len, 1); + */ + + ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len); + crypto_free_blkcipher(tfm); + if (ret < 0) { + pr_err("ceph_aes_decrypt failed %d\n", ret); + return ret; + } + + if (src_len <= *dst1_len) + last_byte = ((char *)dst1)[src_len - 1]; + else if (src_len <= *dst1_len + *dst2_len) + last_byte = ((char *)dst2)[src_len - *dst1_len - 1]; + else + last_byte = pad[src_len - *dst1_len - *dst2_len - 1]; + if (last_byte <= 16 && src_len >= last_byte) { + src_len -= last_byte; + } else { + pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n", + last_byte, (int)src_len); + return -EPERM; /* bad padding */ + } + + if (src_len < *dst1_len) { + *dst1_len = src_len; + *dst2_len = 0; + } else { + *dst2_len = src_len - *dst1_len; + } + /* + print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1, + dst1, *dst1_len, 1); + print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1, + dst2, *dst2_len, 1); + */ + + return 0; +} + + +int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len, + const void *src, size_t src_len) +{ + switch (secret->type) { + case CEPH_CRYPTO_NONE: + if (*dst_len < src_len) + return -ERANGE; + memcpy(dst, src, src_len); + *dst_len = src_len; + return 0; + + case CEPH_CRYPTO_AES: + return ceph_aes_decrypt(secret->key, secret->len, dst, + dst_len, src, src_len); + + default: + return -EINVAL; + } +} + +int ceph_decrypt2(struct ceph_crypto_key *secret, + void *dst1, size_t *dst1_len, + void *dst2, size_t *dst2_len, + const void *src, size_t src_len) +{ + size_t t; + + switch (secret->type) { + case CEPH_CRYPTO_NONE: + if (*dst1_len + *dst2_len < src_len) + return -ERANGE; + t = min(*dst1_len, src_len); + memcpy(dst1, src, t); + *dst1_len = t; + src += t; + src_len -= t; + if (src_len) { + t = min(*dst2_len, src_len); + memcpy(dst2, src, t); + *dst2_len = t; + } + return 0; + + case CEPH_CRYPTO_AES: + return ceph_aes_decrypt2(secret->key, secret->len, + dst1, dst1_len, dst2, dst2_len, + src, src_len); + + default: + return -EINVAL; + } +} + +int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len, + const void *src, size_t src_len) +{ + switch (secret->type) { + case CEPH_CRYPTO_NONE: + if (*dst_len < src_len) + return -ERANGE; + memcpy(dst, src, src_len); + *dst_len = src_len; + return 0; + + case CEPH_CRYPTO_AES: + return ceph_aes_encrypt(secret->key, secret->len, dst, + dst_len, src, src_len); + + default: + return -EINVAL; + } +} + +int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len, + const void *src1, size_t src1_len, + const void *src2, size_t src2_len) +{ + switch (secret->type) { + case CEPH_CRYPTO_NONE: + if (*dst_len < src1_len + src2_len) + return -ERANGE; + memcpy(dst, src1, src1_len); + memcpy(dst + src1_len, src2, src2_len); + *dst_len = src1_len + src2_len; + return 0; + + case CEPH_CRYPTO_AES: + return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len, + src1, src1_len, src2, src2_len); + + default: + return -EINVAL; + } +} diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h new file mode 100644 index 00000000000..40b502e6bd8 --- /dev/null +++ b/fs/ceph/crypto.h @@ -0,0 +1,48 @@ +#ifndef _FS_CEPH_CRYPTO_H +#define _FS_CEPH_CRYPTO_H + +#include "types.h" +#include "buffer.h" + +/* + * cryptographic secret + */ +struct ceph_crypto_key { + int type; + struct ceph_timespec created; + int len; + void *key; +}; + +static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key) +{ + kfree(key->key); +} + +extern int ceph_crypto_key_encode(struct ceph_crypto_key *key, + void **p, void *end); +extern int ceph_crypto_key_decode(struct ceph_crypto_key *key, + void **p, void *end); +extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in); + +/* crypto.c */ +extern int ceph_decrypt(struct ceph_crypto_key *secret, + void *dst, size_t *dst_len, + const void *src, size_t src_len); +extern int ceph_encrypt(struct ceph_crypto_key *secret, + void *dst, size_t *dst_len, + const void *src, size_t src_len); +extern int ceph_decrypt2(struct ceph_crypto_key *secret, + void *dst1, size_t *dst1_len, + void *dst2, size_t *dst2_len, + const void *src, size_t src_len); +extern int ceph_encrypt2(struct ceph_crypto_key *secret, + void *dst, size_t *dst_len, + const void *src1, size_t src1_len, + const void *src2, size_t src2_len); + +/* armor.c */ +extern int ceph_armor(char *dst, const void *src, const void *end); +extern int ceph_unarmor(void *dst, const char *src, const char *end); + +#endif diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c new file mode 100644 index 00000000000..e159f141511 --- /dev/null +++ b/fs/ceph/debugfs.c @@ -0,0 +1,483 @@ +#include "ceph_debug.h" + +#include <linux/device.h> +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +#include "super.h" +#include "mds_client.h" +#include "mon_client.h" +#include "auth.h" + +#ifdef CONFIG_DEBUG_FS + +/* + * Implement /sys/kernel/debug/ceph fun + * + * /sys/kernel/debug/ceph/client* - an instance of the ceph client + * .../osdmap - current osdmap + * .../mdsmap - current mdsmap + * .../monmap - current monmap + * .../osdc - active osd requests + * .../mdsc - active mds requests + * .../monc - mon client state + * .../dentry_lru - dump contents of dentry lru + * .../caps - expose cap (reservation) stats + * .../bdi - symlink to ../../bdi/something + */ + +static struct dentry *ceph_debugfs_dir; + +static int monmap_show(struct seq_file *s, void *p) +{ + int i; + struct ceph_client *client = s->private; + + if (client->monc.monmap == NULL) + return 0; + + seq_printf(s, "epoch %d\n", client->monc.monmap->epoch); + for (i = 0; i < client->monc.monmap->num_mon; i++) { + struct ceph_entity_inst *inst = + &client->monc.monmap->mon_inst[i]; + + seq_printf(s, "\t%s%lld\t%s\n", + ENTITY_NAME(inst->name), + pr_addr(&inst->addr.in_addr)); + } + return 0; +} + +static int mdsmap_show(struct seq_file *s, void *p) +{ + int i; + struct ceph_client *client = s->private; + + if (client->mdsc.mdsmap == NULL) + return 0; + seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch); + seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root); + seq_printf(s, "session_timeout %d\n", + client->mdsc.mdsmap->m_session_timeout); + seq_printf(s, "session_autoclose %d\n", + client->mdsc.mdsmap->m_session_autoclose); + for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) { + struct ceph_entity_addr *addr = + &client->mdsc.mdsmap->m_info[i].addr; + int state = client->mdsc.mdsmap->m_info[i].state; + + seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr), + ceph_mds_state_name(state)); + } + return 0; +} + +static int osdmap_show(struct seq_file *s, void *p) +{ + int i; + struct ceph_client *client = s->private; + struct rb_node *n; + + if (client->osdc.osdmap == NULL) + return 0; + seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch); + seq_printf(s, "flags%s%s\n", + (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ? + " NEARFULL" : "", + (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? + " FULL" : ""); + for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) { + struct ceph_pg_pool_info *pool = + rb_entry(n, struct ceph_pg_pool_info, node); + seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n", + pool->id, pool->v.pg_num, pool->pg_num_mask, + pool->v.lpg_num, pool->lpg_num_mask); + } + for (i = 0; i < client->osdc.osdmap->max_osd; i++) { + struct ceph_entity_addr *addr = + &client->osdc.osdmap->osd_addr[i]; + int state = client->osdc.osdmap->osd_state[i]; + char sb[64]; + + seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n", + i, pr_addr(&addr->in_addr), + ((client->osdc.osdmap->osd_weight[i]*100) >> 16), + ceph_osdmap_state_str(sb, sizeof(sb), state)); + } + return 0; +} + +static int monc_show(struct seq_file *s, void *p) +{ + struct ceph_client *client = s->private; + struct ceph_mon_statfs_request *req; + struct ceph_mon_client *monc = &client->monc; + struct rb_node *rp; + + mutex_lock(&monc->mutex); + + if (monc->have_mdsmap) + seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap); + if (monc->have_osdmap) + seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap); + if (monc->want_next_osdmap) + seq_printf(s, "want next osdmap\n"); + + for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) { + req = rb_entry(rp, struct ceph_mon_statfs_request, node); + seq_printf(s, "%lld statfs\n", req->tid); + } + + mutex_unlock(&monc->mutex); + return 0; +} + +static int mdsc_show(struct seq_file *s, void *p) +{ + struct ceph_client *client = s->private; + struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_mds_request *req; + struct rb_node *rp; + int pathlen; + u64 pathbase; + char *path; + + mutex_lock(&mdsc->mutex); + for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) { + req = rb_entry(rp, struct ceph_mds_request, r_node); + + if (req->r_request) + seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds); + else + seq_printf(s, "%lld\t(no request)\t", req->r_tid); + + seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); + + if (req->r_got_unsafe) + seq_printf(s, "\t(unsafe)"); + else + seq_printf(s, "\t"); + + if (req->r_inode) { + seq_printf(s, " #%llx", ceph_ino(req->r_inode)); + } else if (req->r_dentry) { + path = ceph_mdsc_build_path(req->r_dentry, &pathlen, + &pathbase, 0); + spin_lock(&req->r_dentry->d_lock); + seq_printf(s, " #%llx/%.*s (%s)", + ceph_ino(req->r_dentry->d_parent->d_inode), + req->r_dentry->d_name.len, + req->r_dentry->d_name.name, + path ? path : ""); + spin_unlock(&req->r_dentry->d_lock); + kfree(path); + } else if (req->r_path1) { + seq_printf(s, " #%llx/%s", req->r_ino1.ino, + req->r_path1); + } + + if (req->r_old_dentry) { + path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen, + &pathbase, 0); + spin_lock(&req->r_old_dentry->d_lock); + seq_printf(s, " #%llx/%.*s (%s)", + ceph_ino(req->r_old_dentry->d_parent->d_inode), + req->r_old_dentry->d_name.len, + req->r_old_dentry->d_name.name, + path ? path : ""); + spin_unlock(&req->r_old_dentry->d_lock); + kfree(path); + } else if (req->r_path2) { + if (req->r_ino2.ino) + seq_printf(s, " #%llx/%s", req->r_ino2.ino, + req->r_path2); + else + seq_printf(s, " %s", req->r_path2); + } + + seq_printf(s, "\n"); + } + mutex_unlock(&mdsc->mutex); + + return 0; +} + +static int osdc_show(struct seq_file *s, void *pp) +{ + struct ceph_client *client = s->private; + struct ceph_osd_client *osdc = &client->osdc; + struct rb_node *p; + + mutex_lock(&osdc->request_mutex); + for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { + struct ceph_osd_request *req; + struct ceph_osd_request_head *head; + struct ceph_osd_op *op; + int num_ops; + int opcode, olen; + int i; + + req = rb_entry(p, struct ceph_osd_request, r_node); + + seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid, + req->r_osd ? req->r_osd->o_osd : -1, + le32_to_cpu(req->r_pgid.pool), + le16_to_cpu(req->r_pgid.ps)); + + head = req->r_request->front.iov_base; + op = (void *)(head + 1); + + num_ops = le16_to_cpu(head->num_ops); + olen = le32_to_cpu(head->object_len); + seq_printf(s, "%.*s", olen, + (const char *)(head->ops + num_ops)); + + if (req->r_reassert_version.epoch) + seq_printf(s, "\t%u'%llu", + (unsigned)le32_to_cpu(req->r_reassert_version.epoch), + le64_to_cpu(req->r_reassert_version.version)); + else + seq_printf(s, "\t"); + + for (i = 0; i < num_ops; i++) { + opcode = le16_to_cpu(op->op); + seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); + op++; + } + + seq_printf(s, "\n"); + } + mutex_unlock(&osdc->request_mutex); + return 0; +} + +static int caps_show(struct seq_file *s, void *p) +{ + struct ceph_client *client = p; + int total, avail, used, reserved, min; + + ceph_reservation_status(client, &total, &avail, &used, &reserved, &min); + seq_printf(s, "total\t\t%d\n" + "avail\t\t%d\n" + "used\t\t%d\n" + "reserved\t%d\n" + "min\t%d\n", + total, avail, used, reserved, min); + return 0; +} + +static int dentry_lru_show(struct seq_file *s, void *ptr) +{ + struct ceph_client *client = s->private; + struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_dentry_info *di; + + spin_lock(&mdsc->dentry_lru_lock); + list_for_each_entry(di, &mdsc->dentry_lru, lru) { + struct dentry *dentry = di->dentry; + seq_printf(s, "%p %p\t%.*s\n", + di, dentry, dentry->d_name.len, dentry->d_name.name); + } + spin_unlock(&mdsc->dentry_lru_lock); + + return 0; +} + +#define DEFINE_SHOW_FUNC(name) \ +static int name##_open(struct inode *inode, struct file *file) \ +{ \ + struct seq_file *sf; \ + int ret; \ + \ + ret = single_open(file, name, NULL); \ + sf = file->private_data; \ + sf->private = inode->i_private; \ + return ret; \ +} \ + \ +static const struct file_operations name##_fops = { \ + .open = name##_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +}; + +DEFINE_SHOW_FUNC(monmap_show) +DEFINE_SHOW_FUNC(mdsmap_show) +DEFINE_SHOW_FUNC(osdmap_show) +DEFINE_SHOW_FUNC(monc_show) +DEFINE_SHOW_FUNC(mdsc_show) +DEFINE_SHOW_FUNC(osdc_show) +DEFINE_SHOW_FUNC(dentry_lru_show) +DEFINE_SHOW_FUNC(caps_show) + +static int congestion_kb_set(void *data, u64 val) +{ + struct ceph_client *client = (struct ceph_client *)data; + + if (client) + client->mount_args->congestion_kb = (int)val; + + return 0; +} + +static int congestion_kb_get(void *data, u64 *val) +{ + struct ceph_client *client = (struct ceph_client *)data; + + if (client) + *val = (u64)client->mount_args->congestion_kb; + + return 0; +} + + +DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get, + congestion_kb_set, "%llu\n"); + +int __init ceph_debugfs_init(void) +{ + ceph_debugfs_dir = debugfs_create_dir("ceph", NULL); + if (!ceph_debugfs_dir) + return -ENOMEM; + return 0; +} + +void ceph_debugfs_cleanup(void) +{ + debugfs_remove(ceph_debugfs_dir); +} + +int ceph_debugfs_client_init(struct ceph_client *client) +{ + int ret = 0; + char name[80]; + + snprintf(name, sizeof(name), FSID_FORMAT ".client%lld", + PR_FSID(&client->fsid), client->monc.auth->global_id); + + client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); + if (!client->debugfs_dir) + goto out; + + client->monc.debugfs_file = debugfs_create_file("monc", + 0600, + client->debugfs_dir, + client, + &monc_show_fops); + if (!client->monc.debugfs_file) + goto out; + + client->mdsc.debugfs_file = debugfs_create_file("mdsc", + 0600, + client->debugfs_dir, + client, + &mdsc_show_fops); + if (!client->mdsc.debugfs_file) + goto out; + + client->osdc.debugfs_file = debugfs_create_file("osdc", + 0600, + client->debugfs_dir, + client, + &osdc_show_fops); + if (!client->osdc.debugfs_file) + goto out; + + client->debugfs_monmap = debugfs_create_file("monmap", + 0600, + client->debugfs_dir, + client, + &monmap_show_fops); + if (!client->debugfs_monmap) + goto out; + + client->debugfs_mdsmap = debugfs_create_file("mdsmap", + 0600, + client->debugfs_dir, + client, + &mdsmap_show_fops); + if (!client->debugfs_mdsmap) + goto out; + + client->debugfs_osdmap = debugfs_create_file("osdmap", + 0600, + client->debugfs_dir, + client, + &osdmap_show_fops); + if (!client->debugfs_osdmap) + goto out; + + client->debugfs_dentry_lru = debugfs_create_file("dentry_lru", + 0600, + client->debugfs_dir, + client, + &dentry_lru_show_fops); + if (!client->debugfs_dentry_lru) + goto out; + + client->debugfs_caps = debugfs_create_file("caps", + 0400, + client->debugfs_dir, + client, + &caps_show_fops); + if (!client->debugfs_caps) + goto out; + + client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb", + 0600, + client->debugfs_dir, + client, + &congestion_kb_fops); + if (!client->debugfs_congestion_kb) + goto out; + + sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev)); + client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir, + name); + + return 0; + +out: + ceph_debugfs_client_cleanup(client); + return ret; +} + +void ceph_debugfs_client_cleanup(struct ceph_client *client) +{ + debugfs_remove(client->debugfs_bdi); + debugfs_remove(client->debugfs_caps); + debugfs_remove(client->debugfs_dentry_lru); + debugfs_remove(client->debugfs_osdmap); + debugfs_remove(client->debugfs_mdsmap); + debugfs_remove(client->debugfs_monmap); + debugfs_remove(client->osdc.debugfs_file); + debugfs_remove(client->mdsc.debugfs_file); + debugfs_remove(client->monc.debugfs_file); + debugfs_remove(client->debugfs_congestion_kb); + debugfs_remove(client->debugfs_dir); +} + +#else // CONFIG_DEBUG_FS + +int __init ceph_debugfs_init(void) +{ + return 0; +} + +void ceph_debugfs_cleanup(void) +{ +} + +int ceph_debugfs_client_init(struct ceph_client *client) +{ + return 0; +} + +void ceph_debugfs_client_cleanup(struct ceph_client *client) +{ +} + +#endif // CONFIG_DEBUG_FS diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h new file mode 100644 index 00000000000..65b3e022eaf --- /dev/null +++ b/fs/ceph/decode.h @@ -0,0 +1,194 @@ +#ifndef __CEPH_DECODE_H +#define __CEPH_DECODE_H + +#include <asm/unaligned.h> +#include <linux/time.h> + +#include "types.h" + +/* + * in all cases, + * void **p pointer to position pointer + * void *end pointer to end of buffer (last byte + 1) + */ + +static inline u64 ceph_decode_64(void **p) +{ + u64 v = get_unaligned_le64(*p); + *p += sizeof(u64); + return v; +} +static inline u32 ceph_decode_32(void **p) +{ + u32 v = get_unaligned_le32(*p); + *p += sizeof(u32); + return v; +} +static inline u16 ceph_decode_16(void **p) +{ + u16 v = get_unaligned_le16(*p); + *p += sizeof(u16); + return v; +} +static inline u8 ceph_decode_8(void **p) +{ + u8 v = *(u8 *)*p; + (*p)++; + return v; +} +static inline void ceph_decode_copy(void **p, void *pv, size_t n) +{ + memcpy(pv, *p, n); + *p += n; +} + +/* + * bounds check input. + */ +#define ceph_decode_need(p, end, n, bad) \ + do { \ + if (unlikely(*(p) + (n) > (end))) \ + goto bad; \ + } while (0) + +#define ceph_decode_64_safe(p, end, v, bad) \ + do { \ + ceph_decode_need(p, end, sizeof(u64), bad); \ + v = ceph_decode_64(p); \ + } while (0) +#define ceph_decode_32_safe(p, end, v, bad) \ + do { \ + ceph_decode_need(p, end, sizeof(u32), bad); \ + v = ceph_decode_32(p); \ + } while (0) +#define ceph_decode_16_safe(p, end, v, bad) \ + do { \ + ceph_decode_need(p, end, sizeof(u16), bad); \ + v = ceph_decode_16(p); \ + } while (0) +#define ceph_decode_8_safe(p, end, v, bad) \ + do { \ + ceph_decode_need(p, end, sizeof(u8), bad); \ + v = ceph_decode_8(p); \ + } while (0) + +#define ceph_decode_copy_safe(p, end, pv, n, bad) \ + do { \ + ceph_decode_need(p, end, n, bad); \ + ceph_decode_copy(p, pv, n); \ + } while (0) + +/* + * struct ceph_timespec <-> struct timespec + */ +static inline void ceph_decode_timespec(struct timespec *ts, + const struct ceph_timespec *tv) +{ + ts->tv_sec = le32_to_cpu(tv->tv_sec); + ts->tv_nsec = le32_to_cpu(tv->tv_nsec); +} +static inline void ceph_encode_timespec(struct ceph_timespec *tv, + const struct timespec *ts) +{ + tv->tv_sec = cpu_to_le32(ts->tv_sec); + tv->tv_nsec = cpu_to_le32(ts->tv_nsec); +} + +/* + * sockaddr_storage <-> ceph_sockaddr + */ +static inline void ceph_encode_addr(struct ceph_entity_addr *a) +{ + a->in_addr.ss_family = htons(a->in_addr.ss_family); +} +static inline void ceph_decode_addr(struct ceph_entity_addr *a) +{ + a->in_addr.ss_family = ntohs(a->in_addr.ss_family); + WARN_ON(a->in_addr.ss_family == 512); +} + +/* + * encoders + */ +static inline void ceph_encode_64(void **p, u64 v) +{ + put_unaligned_le64(v, (__le64 *)*p); + *p += sizeof(u64); +} +static inline void ceph_encode_32(void **p, u32 v) +{ + put_unaligned_le32(v, (__le32 *)*p); + *p += sizeof(u32); +} +static inline void ceph_encode_16(void **p, u16 v) +{ + put_unaligned_le16(v, (__le16 *)*p); + *p += sizeof(u16); +} +static inline void ceph_encode_8(void **p, u8 v) +{ + *(u8 *)*p = v; + (*p)++; +} +static inline void ceph_encode_copy(void **p, const void *s, int len) +{ + memcpy(*p, s, len); + *p += len; +} + +/* + * filepath, string encoders + */ +static inline void ceph_encode_filepath(void **p, void *end, + u64 ino, const char *path) +{ + u32 len = path ? strlen(path) : 0; + BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end); + ceph_encode_8(p, 1); + ceph_encode_64(p, ino); + ceph_encode_32(p, len); + if (len) + memcpy(*p, path, len); + *p += len; +} + +static inline void ceph_encode_string(void **p, void *end, + const char *s, u32 len) +{ + BUG_ON(*p + sizeof(len) + len > end); + ceph_encode_32(p, len); + if (len) + memcpy(*p, s, len); + *p += len; +} + +#define ceph_encode_need(p, end, n, bad) \ + do { \ + if (unlikely(*(p) + (n) > (end))) \ + goto bad; \ + } while (0) + +#define ceph_encode_64_safe(p, end, v, bad) \ + do { \ + ceph_encode_need(p, end, sizeof(u64), bad); \ + ceph_encode_64(p, v); \ + } while (0) +#define ceph_encode_32_safe(p, end, v, bad) \ + do { \ + ceph_encode_need(p, end, sizeof(u32), bad); \ + ceph_encode_32(p, v); \ + } while (0) +#define ceph_encode_16_safe(p, end, v, bad) \ + do { \ + ceph_encode_need(p, end, sizeof(u16), bad); \ + ceph_encode_16(p, v); \ + } while (0) + +#define ceph_encode_copy_safe(p, end, pv, n, bad) \ + do { \ + ceph_encode_need(p, end, n, bad); \ + ceph_encode_copy(p, pv, n); \ + } while (0) + + +#endif diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c new file mode 100644 index 00000000000..5107384ee02 --- /dev/null +++ b/fs/ceph/dir.c @@ -0,0 +1,1220 @@ +#include "ceph_debug.h" + +#include <linux/spinlock.h> +#include <linux/fs_struct.h> +#include <linux/namei.h> +#include <linux/sched.h> + +#include "super.h" + +/* + * Directory operations: readdir, lookup, create, link, unlink, + * rename, etc. + */ + +/* + * Ceph MDS operations are specified in terms of a base ino and + * relative path. Thus, the client can specify an operation on a + * specific inode (e.g., a getattr due to fstat(2)), or as a path + * relative to, say, the root directory. + * + * Normally, we limit ourselves to strict inode ops (no path component) + * or dentry operations (a single path component relative to an ino). The + * exception to this is open_root_dentry(), which will open the mount + * point by name. + */ + +const struct inode_operations ceph_dir_iops; +const struct file_operations ceph_dir_fops; +struct dentry_operations ceph_dentry_ops; + +/* + * Initialize ceph dentry state. + */ +int ceph_init_dentry(struct dentry *dentry) +{ + struct ceph_dentry_info *di; + + if (dentry->d_fsdata) + return 0; + + if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) + dentry->d_op = &ceph_dentry_ops; + else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) + dentry->d_op = &ceph_snapdir_dentry_ops; + else + dentry->d_op = &ceph_snap_dentry_ops; + + di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS); + if (!di) + return -ENOMEM; /* oh well */ + + spin_lock(&dentry->d_lock); + if (dentry->d_fsdata) /* lost a race */ + goto out_unlock; + di->dentry = dentry; + di->lease_session = NULL; + dentry->d_fsdata = di; + dentry->d_time = jiffies; + ceph_dentry_lru_add(dentry); +out_unlock: + spin_unlock(&dentry->d_lock); + return 0; +} + + + +/* + * for readdir, we encode the directory frag and offset within that + * frag into f_pos. + */ +static unsigned fpos_frag(loff_t p) +{ + return p >> 32; +} +static unsigned fpos_off(loff_t p) +{ + return p & 0xffffffff; +} + +/* + * When possible, we try to satisfy a readdir by peeking at the + * dcache. We make this work by carefully ordering dentries on + * d_u.d_child when we initially get results back from the MDS, and + * falling back to a "normal" sync readdir if any dentries in the dir + * are dropped. + * + * I_COMPLETE tells indicates we have all dentries in the dir. It is + * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by + * the MDS if/when the directory is modified). + */ +static int __dcache_readdir(struct file *filp, + void *dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct ceph_file_info *fi = filp->private_data; + struct dentry *parent = filp->f_dentry; + struct inode *dir = parent->d_inode; + struct list_head *p; + struct dentry *dentry, *last; + struct ceph_dentry_info *di; + int err = 0; + + /* claim ref on last dentry we returned */ + last = fi->dentry; + fi->dentry = NULL; + + dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos, + last); + + spin_lock(&dcache_lock); + + /* start at beginning? */ + if (filp->f_pos == 2 || (last && + filp->f_pos < ceph_dentry(last)->offset)) { + if (list_empty(&parent->d_subdirs)) + goto out_unlock; + p = parent->d_subdirs.prev; + dout(" initial p %p/%p\n", p->prev, p->next); + } else { + p = last->d_u.d_child.prev; + } + +more: + dentry = list_entry(p, struct dentry, d_u.d_child); + di = ceph_dentry(dentry); + while (1) { + dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next, + parent->d_subdirs.prev, parent->d_subdirs.next); + if (p == &parent->d_subdirs) { + fi->at_end = 1; + goto out_unlock; + } + if (!d_unhashed(dentry) && dentry->d_inode && + ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && + ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && + filp->f_pos <= di->offset) + break; + dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry, + dentry->d_name.len, dentry->d_name.name, di->offset, + filp->f_pos, d_unhashed(dentry) ? " unhashed" : "", + !dentry->d_inode ? " null" : ""); + p = p->prev; + dentry = list_entry(p, struct dentry, d_u.d_child); + di = ceph_dentry(dentry); + } + + atomic_inc(&dentry->d_count); + spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); + + dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, + dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); + filp->f_pos = di->offset; + err = filldir(dirent, dentry->d_name.name, + dentry->d_name.len, di->offset, + dentry->d_inode->i_ino, + dentry->d_inode->i_mode >> 12); + + if (last) { + if (err < 0) { + /* remember our position */ + fi->dentry = last; + fi->next_offset = di->offset; + } else { + dput(last); + } + last = NULL; + } + + spin_lock(&inode->i_lock); + spin_lock(&dcache_lock); + + if (err < 0) + goto out_unlock; + + last = dentry; + + p = p->prev; + filp->f_pos++; + + /* make sure a dentry wasn't dropped while we didn't have dcache_lock */ + if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE)) + goto more; + dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); + err = -EAGAIN; + +out_unlock: + spin_unlock(&dcache_lock); + + if (last) { + spin_unlock(&inode->i_lock); + dput(last); + spin_lock(&inode->i_lock); + } + + return err; +} + +/* + * make note of the last dentry we read, so we can + * continue at the same lexicographical point, + * regardless of what dir changes take place on the + * server. + */ +static int note_last_dentry(struct ceph_file_info *fi, const char *name, + int len) +{ + kfree(fi->last_name); + fi->last_name = kmalloc(len+1, GFP_NOFS); + if (!fi->last_name) + return -ENOMEM; + memcpy(fi->last_name, name, len); + fi->last_name[len] = 0; + dout("note_last_dentry '%s'\n", fi->last_name); + return 0; +} + +static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct ceph_file_info *fi = filp->private_data; + struct inode *inode = filp->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *client = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = &client->mdsc; + unsigned frag = fpos_frag(filp->f_pos); + int off = fpos_off(filp->f_pos); + int err; + u32 ftype; + struct ceph_mds_reply_info_parsed *rinfo; + const int max_entries = client->mount_args->max_readdir; + + dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); + if (fi->at_end) + return 0; + + /* always start with . and .. */ + if (filp->f_pos == 0) { + /* note dir version at start of readdir so we can tell + * if any dentries get dropped */ + fi->dir_release_count = ci->i_release_count; + + dout("readdir off 0 -> '.'\n"); + if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), + inode->i_ino, inode->i_mode >> 12) < 0) + return 0; + filp->f_pos = 1; + off = 1; + } + if (filp->f_pos == 1) { + dout("readdir off 1 -> '..'\n"); + if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), + filp->f_dentry->d_parent->d_inode->i_ino, + inode->i_mode >> 12) < 0) + return 0; + filp->f_pos = 2; + off = 2; + } + + /* can we use the dcache? */ + spin_lock(&inode->i_lock); + if ((filp->f_pos == 2 || fi->dentry) && + !ceph_test_opt(client, NOASYNCREADDIR) && + (ci->i_ceph_flags & CEPH_I_COMPLETE) && + __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { + err = __dcache_readdir(filp, dirent, filldir); + if (err != -EAGAIN) { + spin_unlock(&inode->i_lock); + return err; + } + } + spin_unlock(&inode->i_lock); + if (fi->dentry) { + err = note_last_dentry(fi, fi->dentry->d_name.name, + fi->dentry->d_name.len); + if (err) + return err; + dput(fi->dentry); + fi->dentry = NULL; + } + + /* proceed with a normal readdir */ + +more: + /* do we have the correct frag content buffered? */ + if (fi->frag != frag || fi->last_readdir == NULL) { + struct ceph_mds_request *req; + int op = ceph_snap(inode) == CEPH_SNAPDIR ? + CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; + + /* discard old result, if any */ + if (fi->last_readdir) + ceph_mdsc_put_request(fi->last_readdir); + + /* requery frag tree, as the frag topology may have changed */ + frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL); + + dout("readdir fetching %llx.%llx frag %x offset '%s'\n", + ceph_vinop(inode), frag, fi->last_name); + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = igrab(inode); + req->r_dentry = dget(filp->f_dentry); + /* hints to request -> mds selection code */ + req->r_direct_mode = USE_AUTH_MDS; + req->r_direct_hash = ceph_frag_value(frag); + req->r_direct_is_hash = true; + req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); + req->r_readdir_offset = fi->next_offset; + req->r_args.readdir.frag = cpu_to_le32(frag); + req->r_args.readdir.max_entries = cpu_to_le32(max_entries); + req->r_num_caps = max_entries; + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err < 0) { + ceph_mdsc_put_request(req); + return err; + } + dout("readdir got and parsed readdir result=%d" + " on frag %x, end=%d, complete=%d\n", err, frag, + (int)req->r_reply_info.dir_end, + (int)req->r_reply_info.dir_complete); + + if (!req->r_did_prepopulate) { + dout("readdir !did_prepopulate"); + fi->dir_release_count--; /* preclude I_COMPLETE */ + } + + /* note next offset and last dentry name */ + fi->offset = fi->next_offset; + fi->last_readdir = req; + + if (req->r_reply_info.dir_end) { + kfree(fi->last_name); + fi->last_name = NULL; + fi->next_offset = 0; + } else { + rinfo = &req->r_reply_info; + err = note_last_dentry(fi, + rinfo->dir_dname[rinfo->dir_nr-1], + rinfo->dir_dname_len[rinfo->dir_nr-1]); + if (err) + return err; + fi->next_offset += rinfo->dir_nr; + } + } + + rinfo = &fi->last_readdir->r_reply_info; + dout("readdir frag %x num %d off %d chunkoff %d\n", frag, + rinfo->dir_nr, off, fi->offset); + while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) { + u64 pos = ceph_make_fpos(frag, off); + struct ceph_mds_reply_inode *in = + rinfo->dir_in[off - fi->offset].in; + dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", + off, off - fi->offset, rinfo->dir_nr, pos, + rinfo->dir_dname_len[off - fi->offset], + rinfo->dir_dname[off - fi->offset], in); + BUG_ON(!in); + ftype = le32_to_cpu(in->mode) >> 12; + if (filldir(dirent, + rinfo->dir_dname[off - fi->offset], + rinfo->dir_dname_len[off - fi->offset], + pos, + le64_to_cpu(in->ino), + ftype) < 0) { + dout("filldir stopping us...\n"); + return 0; + } + off++; + filp->f_pos = pos + 1; + } + + if (fi->last_name) { + ceph_mdsc_put_request(fi->last_readdir); + fi->last_readdir = NULL; + goto more; + } + + /* more frags? */ + if (!ceph_frag_is_rightmost(frag)) { + frag = ceph_frag_next(frag); + off = 0; + filp->f_pos = ceph_make_fpos(frag, off); + dout("readdir next frag is %x\n", frag); + goto more; + } + fi->at_end = 1; + + /* + * if dir_release_count still matches the dir, no dentries + * were released during the whole readdir, and we should have + * the complete dir contents in our cache. + */ + spin_lock(&inode->i_lock); + if (ci->i_release_count == fi->dir_release_count) { + dout(" marking %p complete\n", inode); + ci->i_ceph_flags |= CEPH_I_COMPLETE; + ci->i_max_offset = filp->f_pos; + } + spin_unlock(&inode->i_lock); + + dout("readdir %p filp %p done.\n", inode, filp); + return 0; +} + +static void reset_readdir(struct ceph_file_info *fi) +{ + if (fi->last_readdir) { + ceph_mdsc_put_request(fi->last_readdir); + fi->last_readdir = NULL; + } + kfree(fi->last_name); + fi->next_offset = 2; /* compensate for . and .. */ + if (fi->dentry) { + dput(fi->dentry); + fi->dentry = NULL; + } + fi->at_end = 0; +} + +static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) +{ + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file->f_mapping->host; + loff_t old_offset = offset; + loff_t retval; + + mutex_lock(&inode->i_mutex); + switch (origin) { + case SEEK_END: + offset += inode->i_size + 2; /* FIXME */ + break; + case SEEK_CUR: + offset += file->f_pos; + } + retval = -EINVAL; + if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + fi->at_end = 0; + } + retval = offset; + + /* + * discard buffered readdir content on seekdir(0), or + * seek to new frag, or seek prior to current chunk. + */ + if (offset == 0 || + fpos_frag(offset) != fpos_frag(old_offset) || + fpos_off(offset) < fi->offset) { + dout("dir_llseek dropping %p content\n", file); + reset_readdir(fi); + } + + /* bump dir_release_count if we did a forward seek */ + if (offset > old_offset) + fi->dir_release_count--; + } + mutex_unlock(&inode->i_mutex); + return retval; +} + +/* + * Process result of a lookup/open request. + * + * Mainly, make sure we return the final req->r_dentry (if it already + * existed) in place of the original VFS-provided dentry when they + * differ. + * + * Gracefully handle the case where the MDS replies with -ENOENT and + * no trace (which it may do, at its discretion, e.g., if it doesn't + * care to issue a lease on the negative dentry). + */ +struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, + struct dentry *dentry, int err) +{ + struct ceph_client *client = ceph_client(dentry->d_sb); + struct inode *parent = dentry->d_parent->d_inode; + + /* .snap dir? */ + if (err == -ENOENT && + ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */ + strcmp(dentry->d_name.name, + client->mount_args->snapdir_name) == 0) { + struct inode *inode = ceph_get_snapdir(parent); + dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", + dentry, dentry->d_name.len, dentry->d_name.name, inode); + d_add(dentry, inode); + err = 0; + } + + if (err == -ENOENT) { + /* no trace? */ + err = 0; + if (!req->r_reply_info.head->is_dentry) { + dout("ENOENT and no trace, dentry %p inode %p\n", + dentry, dentry->d_inode); + if (dentry->d_inode) { + d_drop(dentry); + err = -ENOENT; + } else { + d_add(dentry, NULL); + } + } + } + if (err) + dentry = ERR_PTR(err); + else if (dentry != req->r_dentry) + dentry = dget(req->r_dentry); /* we got spliced */ + else + dentry = NULL; + return dentry; +} + +static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) +{ + return ceph_ino(inode) == CEPH_INO_ROOT && + strncmp(dentry->d_name.name, ".ceph", 5) == 0; +} + +/* + * Look up a single dir entry. If there is a lookup intent, inform + * the MDS so that it gets our 'caps wanted' value in a single op. + */ +static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct ceph_client *client = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_mds_request *req; + int op; + int err; + + dout("lookup %p dentry %p '%.*s'\n", + dir, dentry, dentry->d_name.len, dentry->d_name.name); + + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + err = ceph_init_dentry(dentry); + if (err < 0) + return ERR_PTR(err); + + /* open (but not create!) intent? */ + if (nd && + (nd->flags & LOOKUP_OPEN) && + (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */ + !(nd->intent.open.flags & O_CREAT)) { + int mode = nd->intent.open.create_mode & ~current->fs->umask; + return ceph_lookup_open(dir, dentry, nd, mode, 1); + } + + /* can we conclude ENOENT locally? */ + if (dentry->d_inode == NULL) { + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_dentry_info *di = ceph_dentry(dentry); + + spin_lock(&dir->i_lock); + dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); + if (strncmp(dentry->d_name.name, + client->mount_args->snapdir_name, + dentry->d_name.len) && + !is_root_ceph_dentry(dir, dentry) && + (ci->i_ceph_flags & CEPH_I_COMPLETE) && + (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { + di->offset = ci->i_max_offset++; + spin_unlock(&dir->i_lock); + dout(" dir %p complete, -ENOENT\n", dir); + d_add(dentry, NULL); + di->lease_shared_gen = ci->i_shared_gen; + return NULL; + } + spin_unlock(&dir->i_lock); + } + + op = ceph_snap(dir) == CEPH_SNAPDIR ? + CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; + req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_PTR(PTR_ERR(req)); + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + /* we only need inode linkage */ + req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); + req->r_locked_dir = dir; + err = ceph_mdsc_do_request(mdsc, NULL, req); + dentry = ceph_finish_lookup(req, dentry, err); + ceph_mdsc_put_request(req); /* will dput(dentry) */ + dout("lookup result=%p\n", dentry); + return dentry; +} + +/* + * If we do a create but get no trace back from the MDS, follow up with + * a lookup (the VFS expects us to link up the provided dentry). + */ +int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) +{ + struct dentry *result = ceph_lookup(dir, dentry, NULL); + + if (result && !IS_ERR(result)) { + /* + * We created the item, then did a lookup, and found + * it was already linked to another inode we already + * had in our cache (and thus got spliced). Link our + * dentry to that inode, but don't hash it, just in + * case the VFS wants to dereference it. + */ + BUG_ON(!result->d_inode); + d_instantiate(dentry, result->d_inode); + return 0; + } + return PTR_ERR(result); +} + +static int ceph_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + struct ceph_client *client = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_mds_request *req; + int err; + + if (ceph_snap(dir) != CEPH_NOSNAP) + return -EROFS; + + dout("mknod in dir %p dentry %p mode 0%o rdev %d\n", + dir, dentry, mode, rdev); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS); + if (IS_ERR(req)) { + d_drop(dentry); + return PTR_ERR(req); + } + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_locked_dir = dir; + req->r_args.mknod.mode = cpu_to_le32(mode); + req->r_args.mknod.rdev = cpu_to_le32(rdev); + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + err = ceph_mdsc_do_request(mdsc, dir, req); + if (!err && !req->r_reply_info.head->is_dentry) + err = ceph_handle_notrace_create(dir, dentry); + ceph_mdsc_put_request(req); + if (err) + d_drop(dentry); + return err; +} + +static int ceph_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + dout("create in dir %p dentry %p name '%.*s'\n", + dir, dentry, dentry->d_name.len, dentry->d_name.name); + + if (ceph_snap(dir) != CEPH_NOSNAP) + return -EROFS; + + if (nd) { + BUG_ON((nd->flags & LOOKUP_OPEN) == 0); + dentry = ceph_lookup_open(dir, dentry, nd, mode, 0); + /* hrm, what should i do here if we get aliased? */ + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + return 0; + } + + /* fall back to mknod */ + return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0); +} + +static int ceph_symlink(struct inode *dir, struct dentry *dentry, + const char *dest) +{ + struct ceph_client *client = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_mds_request *req; + int err; + + if (ceph_snap(dir) != CEPH_NOSNAP) + return -EROFS; + + dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS); + if (IS_ERR(req)) { + d_drop(dentry); + return PTR_ERR(req); + } + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_path2 = kstrdup(dest, GFP_NOFS); + req->r_locked_dir = dir; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + err = ceph_mdsc_do_request(mdsc, dir, req); + if (!err && !req->r_reply_info.head->is_dentry) + err = ceph_handle_notrace_create(dir, dentry); + ceph_mdsc_put_request(req); + if (err) + d_drop(dentry); + return err; +} + +static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct ceph_client *client = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_mds_request *req; + int err = -EROFS; + int op; + + if (ceph_snap(dir) == CEPH_SNAPDIR) { + /* mkdir .snap/foo is a MKSNAP */ + op = CEPH_MDS_OP_MKSNAP; + dout("mksnap dir %p snap '%.*s' dn %p\n", dir, + dentry->d_name.len, dentry->d_name.name, dentry); + } else if (ceph_snap(dir) == CEPH_NOSNAP) { + dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode); + op = CEPH_MDS_OP_MKDIR; + } else { + goto out; + } + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_locked_dir = dir; + req->r_args.mkdir.mode = cpu_to_le32(mode); + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + err = ceph_mdsc_do_request(mdsc, dir, req); + if (!err && !req->r_reply_info.head->is_dentry) + err = ceph_handle_notrace_create(dir, dentry); + ceph_mdsc_put_request(req); +out: + if (err < 0) + d_drop(dentry); + return err; +} + +static int ceph_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct ceph_client *client = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_mds_request *req; + int err; + + if (ceph_snap(dir) != CEPH_NOSNAP) + return -EROFS; + + dout("link in dir %p old_dentry %p dentry %p\n", dir, + old_dentry, dentry); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS); + if (IS_ERR(req)) { + d_drop(dentry); + return PTR_ERR(req); + } + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */ + req->r_locked_dir = dir; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + err = ceph_mdsc_do_request(mdsc, dir, req); + if (err) + d_drop(dentry); + else if (!req->r_reply_info.head->is_dentry) + d_instantiate(dentry, igrab(old_dentry->d_inode)); + ceph_mdsc_put_request(req); + return err; +} + +/* + * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it + * looks like the link count will hit 0, drop any other caps (other + * than PIN) we don't specifically want (due to the file still being + * open). + */ +static int drop_caps_for_unlink(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; + + spin_lock(&inode->i_lock); + if (inode->i_nlink == 1) { + drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); + ci->i_ceph_flags |= CEPH_I_NODELAY; + } + spin_unlock(&inode->i_lock); + return drop; +} + +/* + * rmdir and unlink are differ only by the metadata op code + */ +static int ceph_unlink(struct inode *dir, struct dentry *dentry) +{ + struct ceph_client *client = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = &client->mdsc; + struct inode *inode = dentry->d_inode; + struct ceph_mds_request *req; + int err = -EROFS; + int op; + + if (ceph_snap(dir) == CEPH_SNAPDIR) { + /* rmdir .snap/foo is RMSNAP */ + dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len, + dentry->d_name.name, dentry); + op = CEPH_MDS_OP_RMSNAP; + } else if (ceph_snap(dir) == CEPH_NOSNAP) { + dout("unlink/rmdir dir %p dn %p inode %p\n", + dir, dentry, inode); + op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ? + CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK; + } else + goto out; + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_locked_dir = dir; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + req->r_inode_drop = drop_caps_for_unlink(inode); + err = ceph_mdsc_do_request(mdsc, dir, req); + if (!err && !req->r_reply_info.head->is_dentry) + d_delete(dentry); + ceph_mdsc_put_request(req); +out: + return err; +} + +static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb); + struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_mds_request *req; + int err; + + if (ceph_snap(old_dir) != ceph_snap(new_dir)) + return -EXDEV; + if (ceph_snap(old_dir) != CEPH_NOSNAP || + ceph_snap(new_dir) != CEPH_NOSNAP) + return -EROFS; + dout("rename dir %p dentry %p to dir %p dentry %p\n", + old_dir, old_dentry, new_dir, new_dentry); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_dentry = dget(new_dentry); + req->r_num_caps = 2; + req->r_old_dentry = dget(old_dentry); + req->r_locked_dir = new_dir; + req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + /* release LINK_RDCACHE on source inode (mds will lock it) */ + req->r_old_inode_drop = CEPH_CAP_LINK_SHARED; + if (new_dentry->d_inode) + req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode); + err = ceph_mdsc_do_request(mdsc, old_dir, req); + if (!err && !req->r_reply_info.head->is_dentry) { + /* + * Normally d_move() is done by fill_trace (called by + * do_request, above). If there is no trace, we need + * to do it here. + */ + d_move(old_dentry, new_dentry); + } + ceph_mdsc_put_request(req); + return err; +} + + +/* + * Check if dentry lease is valid. If not, delete the lease. Try to + * renew if the least is more than half up. + */ +static int dentry_lease_is_valid(struct dentry *dentry) +{ + struct ceph_dentry_info *di; + struct ceph_mds_session *s; + int valid = 0; + u32 gen; + unsigned long ttl; + struct ceph_mds_session *session = NULL; + struct inode *dir = NULL; + u32 seq = 0; + + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + if (di && di->lease_session) { + s = di->lease_session; + spin_lock(&s->s_cap_lock); + gen = s->s_cap_gen; + ttl = s->s_cap_ttl; + spin_unlock(&s->s_cap_lock); + + if (di->lease_gen == gen && + time_before(jiffies, dentry->d_time) && + time_before(jiffies, ttl)) { + valid = 1; + if (di->lease_renew_after && + time_after(jiffies, di->lease_renew_after)) { + /* we should renew */ + dir = dentry->d_parent->d_inode; + session = ceph_get_mds_session(s); + seq = di->lease_seq; + di->lease_renew_after = 0; + di->lease_renew_from = jiffies; + } + } + } + spin_unlock(&dentry->d_lock); + + if (session) { + ceph_mdsc_lease_send_msg(session, dir, dentry, + CEPH_MDS_LEASE_RENEW, seq); + ceph_put_mds_session(session); + } + dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid); + return valid; +} + +/* + * Check if directory-wide content lease/cap is valid. + */ +static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_dentry_info *di = ceph_dentry(dentry); + int valid = 0; + + spin_lock(&dir->i_lock); + if (ci->i_shared_gen == di->lease_shared_gen) + valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); + spin_unlock(&dir->i_lock); + dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n", + dir, (unsigned)ci->i_shared_gen, dentry, + (unsigned)di->lease_shared_gen, valid); + return valid; +} + +/* + * Check if cached dentry can be trusted. + */ +static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *dir = dentry->d_parent->d_inode; + + dout("d_revalidate %p '%.*s' inode %p\n", dentry, + dentry->d_name.len, dentry->d_name.name, dentry->d_inode); + + /* always trust cached snapped dentries, snapdir dentry */ + if (ceph_snap(dir) != CEPH_NOSNAP) { + dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry, + dentry->d_name.len, dentry->d_name.name, dentry->d_inode); + goto out_touch; + } + if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) + goto out_touch; + + if (dentry_lease_is_valid(dentry) || + dir_lease_is_valid(dir, dentry)) + goto out_touch; + + dout("d_revalidate %p invalid\n", dentry); + d_drop(dentry); + return 0; +out_touch: + ceph_dentry_lru_touch(dentry); + return 1; +} + +/* + * When a dentry is released, clear the dir I_COMPLETE if it was part + * of the current dir gen. + */ +static void ceph_dentry_release(struct dentry *dentry) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + struct inode *parent_inode = dentry->d_parent->d_inode; + + if (parent_inode) { + struct ceph_inode_info *ci = ceph_inode(parent_inode); + + spin_lock(&parent_inode->i_lock); + if (ci->i_shared_gen == di->lease_shared_gen) { + dout(" clearing %p complete (d_release)\n", + parent_inode); + ci->i_ceph_flags &= ~CEPH_I_COMPLETE; + ci->i_release_count++; + } + spin_unlock(&parent_inode->i_lock); + } + if (di) { + ceph_dentry_lru_del(dentry); + if (di->lease_session) + ceph_put_mds_session(di->lease_session); + kmem_cache_free(ceph_dentry_cachep, di); + dentry->d_fsdata = NULL; + } +} + +static int ceph_snapdir_d_revalidate(struct dentry *dentry, + struct nameidata *nd) +{ + /* + * Eventually, we'll want to revalidate snapped metadata + * too... probably... + */ + return 1; +} + + + +/* + * read() on a dir. This weird interface hack only works if mounted + * with '-o dirstat'. + */ +static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, + loff_t *ppos) +{ + struct ceph_file_info *cf = file->private_data; + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + int left; + + if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT)) + return -EISDIR; + + if (!cf->dir_info) { + cf->dir_info = kmalloc(1024, GFP_NOFS); + if (!cf->dir_info) + return -ENOMEM; + cf->dir_info_len = + sprintf(cf->dir_info, + "entries: %20lld\n" + " files: %20lld\n" + " subdirs: %20lld\n" + "rentries: %20lld\n" + " rfiles: %20lld\n" + " rsubdirs: %20lld\n" + "rbytes: %20lld\n" + "rctime: %10ld.%09ld\n", + ci->i_files + ci->i_subdirs, + ci->i_files, + ci->i_subdirs, + ci->i_rfiles + ci->i_rsubdirs, + ci->i_rfiles, + ci->i_rsubdirs, + ci->i_rbytes, + (long)ci->i_rctime.tv_sec, + (long)ci->i_rctime.tv_nsec); + } + + if (*ppos >= cf->dir_info_len) + return 0; + size = min_t(unsigned, size, cf->dir_info_len-*ppos); + left = copy_to_user(buf, cf->dir_info + *ppos, size); + if (left == size) + return -EFAULT; + *ppos += (size - left); + return size - left; +} + +/* + * an fsync() on a dir will wait for any uncommitted directory + * operations to commit. + */ +static int ceph_dir_fsync(struct file *file, struct dentry *dentry, + int datasync) +{ + struct inode *inode = dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct list_head *head = &ci->i_unsafe_dirops; + struct ceph_mds_request *req; + u64 last_tid; + int ret = 0; + + dout("dir_fsync %p\n", inode); + spin_lock(&ci->i_unsafe_lock); + if (list_empty(head)) + goto out; + + req = list_entry(head->prev, + struct ceph_mds_request, r_unsafe_dir_item); + last_tid = req->r_tid; + + do { + ceph_mdsc_get_request(req); + spin_unlock(&ci->i_unsafe_lock); + dout("dir_fsync %p wait on tid %llu (until %llu)\n", + inode, req->r_tid, last_tid); + if (req->r_timeout) { + ret = wait_for_completion_timeout( + &req->r_safe_completion, req->r_timeout); + if (ret > 0) + ret = 0; + else if (ret == 0) + ret = -EIO; /* timed out */ + } else { + wait_for_completion(&req->r_safe_completion); + } + spin_lock(&ci->i_unsafe_lock); + ceph_mdsc_put_request(req); + + if (ret || list_empty(head)) + break; + req = list_entry(head->next, + struct ceph_mds_request, r_unsafe_dir_item); + } while (req->r_tid < last_tid); +out: + spin_unlock(&ci->i_unsafe_lock); + return ret; +} + +/* + * We maintain a private dentry LRU. + * + * FIXME: this needs to be changed to a per-mds lru to be useful. + */ +void ceph_dentry_lru_add(struct dentry *dn) +{ + struct ceph_dentry_info *di = ceph_dentry(dn); + struct ceph_mds_client *mdsc; + + dout("dentry_lru_add %p %p '%.*s'\n", di, dn, + dn->d_name.len, dn->d_name.name); + if (di) { + mdsc = &ceph_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_add_tail(&di->lru, &mdsc->dentry_lru); + mdsc->num_dentry++; + spin_unlock(&mdsc->dentry_lru_lock); + } +} + +void ceph_dentry_lru_touch(struct dentry *dn) +{ + struct ceph_dentry_info *di = ceph_dentry(dn); + struct ceph_mds_client *mdsc; + + dout("dentry_lru_touch %p %p '%.*s'\n", di, dn, + dn->d_name.len, dn->d_name.name); + if (di) { + mdsc = &ceph_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_move_tail(&di->lru, &mdsc->dentry_lru); + spin_unlock(&mdsc->dentry_lru_lock); + } +} + +void ceph_dentry_lru_del(struct dentry *dn) +{ + struct ceph_dentry_info *di = ceph_dentry(dn); + struct ceph_mds_client *mdsc; + + dout("dentry_lru_del %p %p '%.*s'\n", di, dn, + dn->d_name.len, dn->d_name.name); + if (di) { + mdsc = &ceph_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_del_init(&di->lru); + mdsc->num_dentry--; + spin_unlock(&mdsc->dentry_lru_lock); + } +} + +const struct file_operations ceph_dir_fops = { + .read = ceph_read_dir, + .readdir = ceph_readdir, + .llseek = ceph_dir_llseek, + .open = ceph_open, + .release = ceph_release, + .unlocked_ioctl = ceph_ioctl, + .fsync = ceph_dir_fsync, +}; + +const struct inode_operations ceph_dir_iops = { + .lookup = ceph_lookup, + .permission = ceph_permission, + .getattr = ceph_getattr, + .setattr = ceph_setattr, + .setxattr = ceph_setxattr, + .getxattr = ceph_getxattr, + .listxattr = ceph_listxattr, + .removexattr = ceph_removexattr, + .mknod = ceph_mknod, + .symlink = ceph_symlink, + .mkdir = ceph_mkdir, + .link = ceph_link, + .unlink = ceph_unlink, + .rmdir = ceph_unlink, + .rename = ceph_rename, + .create = ceph_create, +}; + +struct dentry_operations ceph_dentry_ops = { + .d_revalidate = ceph_d_revalidate, + .d_release = ceph_dentry_release, +}; + +struct dentry_operations ceph_snapdir_dentry_ops = { + .d_revalidate = ceph_snapdir_d_revalidate, +}; + +struct dentry_operations ceph_snap_dentry_ops = { +}; diff --git a/fs/ceph/export.c b/fs/ceph/export.c new file mode 100644 index 00000000000..fc68e39cbad --- /dev/null +++ b/fs/ceph/export.c @@ -0,0 +1,223 @@ +#include "ceph_debug.h" + +#include <linux/exportfs.h> +#include <asm/unaligned.h> + +#include "super.h" + +/* + * NFS export support + * + * NFS re-export of a ceph mount is, at present, only semireliable. + * The basic issue is that the Ceph architectures doesn't lend itself + * well to generating filehandles that will remain valid forever. + * + * So, we do our best. If you're lucky, your inode will be in the + * client's cache. If it's not, and you have a connectable fh, then + * the MDS server may be able to find it for you. Otherwise, you get + * ESTALE. + * + * There are ways to this more reliable, but in the non-connectable fh + * case, we won't every work perfectly, and in the connectable case, + * some changes are needed on the MDS side to work better. + */ + +/* + * Basic fh + */ +struct ceph_nfs_fh { + u64 ino; +} __attribute__ ((packed)); + +/* + * Larger 'connectable' fh that includes parent ino and name hash. + * Use this whenever possible, as it works more reliably. + */ +struct ceph_nfs_confh { + u64 ino, parent_ino; + u32 parent_name_hash; +} __attribute__ ((packed)); + +static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, + int connectable) +{ + struct ceph_nfs_fh *fh = (void *)rawfh; + struct ceph_nfs_confh *cfh = (void *)rawfh; + struct dentry *parent = dentry->d_parent; + struct inode *inode = dentry->d_inode; + int type; + + /* don't re-export snaps */ + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EINVAL; + + if (*max_len >= sizeof(*cfh)) { + dout("encode_fh %p connectable\n", dentry); + cfh->ino = ceph_ino(dentry->d_inode); + cfh->parent_ino = ceph_ino(parent->d_inode); + cfh->parent_name_hash = parent->d_name.hash; + *max_len = sizeof(*cfh); + type = 2; + } else if (*max_len > sizeof(*fh)) { + if (connectable) + return -ENOSPC; + dout("encode_fh %p\n", dentry); + fh->ino = ceph_ino(dentry->d_inode); + *max_len = sizeof(*fh); + type = 1; + } else { + return -ENOSPC; + } + return type; +} + +/* + * convert regular fh to dentry + * + * FIXME: we should try harder by querying the mds for the ino. + */ +static struct dentry *__fh_to_dentry(struct super_block *sb, + struct ceph_nfs_fh *fh) +{ + struct inode *inode; + struct dentry *dentry; + struct ceph_vino vino; + int err; + + dout("__fh_to_dentry %llx\n", fh->ino); + vino.ino = fh->ino; + vino.snap = CEPH_NOSNAP; + inode = ceph_find_inode(sb, vino); + if (!inode) + return ERR_PTR(-ESTALE); + + dentry = d_obtain_alias(inode); + if (!dentry) { + pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", + fh->ino, inode); + iput(inode); + return ERR_PTR(-ENOMEM); + } + err = ceph_init_dentry(dentry); + + if (err < 0) { + iput(inode); + return ERR_PTR(err); + } + dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry); + return dentry; +} + +/* + * convert connectable fh to dentry + */ +static struct dentry *__cfh_to_dentry(struct super_block *sb, + struct ceph_nfs_confh *cfh) +{ + struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc; + struct inode *inode; + struct dentry *dentry; + struct ceph_vino vino; + int err; + + dout("__cfh_to_dentry %llx (%llx/%x)\n", + cfh->ino, cfh->parent_ino, cfh->parent_name_hash); + + vino.ino = cfh->ino; + vino.snap = CEPH_NOSNAP; + inode = ceph_find_inode(sb, vino); + if (!inode) { + struct ceph_mds_request *req; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH, + USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_PTR(PTR_ERR(req)); + + req->r_ino1 = vino; + req->r_ino2.ino = cfh->parent_ino; + req->r_ino2.snap = CEPH_NOSNAP; + req->r_path2 = kmalloc(16, GFP_NOFS); + snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash); + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + ceph_mdsc_put_request(req); + inode = ceph_find_inode(sb, vino); + if (!inode) + return ERR_PTR(err ? err : -ESTALE); + } + + dentry = d_obtain_alias(inode); + if (!dentry) { + pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", + cfh->ino, inode); + iput(inode); + return ERR_PTR(-ENOMEM); + } + err = ceph_init_dentry(dentry); + if (err < 0) { + iput(inode); + return ERR_PTR(err); + } + dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry); + return dentry; +} + +static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + if (fh_type == 1) + return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw); + else + return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw); +} + +/* + * get parent, if possible. + * + * FIXME: we could do better by querying the mds to discover the + * parent. + */ +static struct dentry *ceph_fh_to_parent(struct super_block *sb, + struct fid *fid, + int fh_len, int fh_type) +{ + struct ceph_nfs_confh *cfh = (void *)fid->raw; + struct ceph_vino vino; + struct inode *inode; + struct dentry *dentry; + int err; + + if (fh_type == 1) + return ERR_PTR(-ESTALE); + + pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino, + cfh->parent_name_hash); + + vino.ino = cfh->ino; + vino.snap = CEPH_NOSNAP; + inode = ceph_find_inode(sb, vino); + if (!inode) + return ERR_PTR(-ESTALE); + + dentry = d_obtain_alias(inode); + if (!dentry) { + pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", + cfh->ino, inode); + iput(inode); + return ERR_PTR(-ENOMEM); + } + err = ceph_init_dentry(dentry); + if (err < 0) { + iput(inode); + return ERR_PTR(err); + } + dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry); + return dentry; +} + +const struct export_operations ceph_export_ops = { + .encode_fh = ceph_encode_fh, + .fh_to_dentry = ceph_fh_to_dentry, + .fh_to_parent = ceph_fh_to_parent, +}; diff --git a/fs/ceph/file.c b/fs/ceph/file.c new file mode 100644 index 00000000000..5d2af8464f6 --- /dev/null +++ b/fs/ceph/file.c @@ -0,0 +1,937 @@ +#include "ceph_debug.h" + +#include <linux/sched.h> +#include <linux/file.h> +#include <linux/namei.h> +#include <linux/writeback.h> + +#include "super.h" +#include "mds_client.h" + +/* + * Ceph file operations + * + * Implement basic open/close functionality, and implement + * read/write. + * + * We implement three modes of file I/O: + * - buffered uses the generic_file_aio_{read,write} helpers + * + * - synchronous is used when there is multi-client read/write + * sharing, avoids the page cache, and synchronously waits for an + * ack from the OSD. + * + * - direct io takes the variant of the sync path that references + * user pages directly. + * + * fsync() flushes and waits on dirty pages, but just queues metadata + * for writeback: since the MDS can recover size and mtime there is no + * need to wait for MDS acknowledgement. + */ + + +/* + * Prepare an open request. Preallocate ceph_cap to avoid an + * inopportune ENOMEM later. + */ +static struct ceph_mds_request * +prepare_open_request(struct super_block *sb, int flags, int create_mode) +{ + struct ceph_client *client = ceph_sb_to_client(sb); + struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_mds_request *req; + int want_auth = USE_ANY_MDS; + int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; + + if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC)) + want_auth = USE_AUTH_MDS; + + req = ceph_mdsc_create_request(mdsc, op, want_auth); + if (IS_ERR(req)) + goto out; + req->r_fmode = ceph_flags_to_mode(flags); + req->r_args.open.flags = cpu_to_le32(flags); + req->r_args.open.mode = cpu_to_le32(create_mode); + req->r_args.open.preferred = cpu_to_le32(-1); +out: + return req; +} + +/* + * initialize private struct file data. + * if we fail, clean up by dropping fmode reference on the ceph_inode + */ +static int ceph_init_file(struct inode *inode, struct file *file, int fmode) +{ + struct ceph_file_info *cf; + int ret = 0; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + dout("init_file %p %p 0%o (regular)\n", inode, file, + inode->i_mode); + cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO); + if (cf == NULL) { + ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + return -ENOMEM; + } + cf->fmode = fmode; + cf->next_offset = 2; + file->private_data = cf; + BUG_ON(inode->i_fop->release != ceph_release); + break; + + case S_IFLNK: + dout("init_file %p %p 0%o (symlink)\n", inode, file, + inode->i_mode); + ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + break; + + default: + dout("init_file %p %p 0%o (special)\n", inode, file, + inode->i_mode); + /* + * we need to drop the open ref now, since we don't + * have .release set to ceph_release. + */ + ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + BUG_ON(inode->i_fop->release == ceph_release); + + /* call the proper open fop */ + ret = inode->i_fop->open(inode, file); + } + return ret; +} + +/* + * If the filp already has private_data, that means the file was + * already opened by intent during lookup, and we do nothing. + * + * If we already have the requisite capabilities, we can satisfy + * the open request locally (no need to request new caps from the + * MDS). We do, however, need to inform the MDS (asynchronously) + * if our wanted caps set expands. + */ +int ceph_open(struct inode *inode, struct file *file) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *client = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_mds_request *req; + struct ceph_file_info *cf = file->private_data; + struct inode *parent_inode = file->f_dentry->d_parent->d_inode; + int err; + int flags, fmode, wanted; + + if (cf) { + dout("open file %p is already opened\n", file); + return 0; + } + + /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */ + flags = file->f_flags & ~(O_CREAT|O_EXCL); + if (S_ISDIR(inode->i_mode)) + flags = O_DIRECTORY; /* mds likes to know */ + + dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode, + ceph_vinop(inode), file, flags, file->f_flags); + fmode = ceph_flags_to_mode(flags); + wanted = ceph_caps_for_mode(fmode); + + /* snapped files are read-only */ + if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE)) + return -EROFS; + + /* trivially open snapdir */ + if (ceph_snap(inode) == CEPH_SNAPDIR) { + spin_lock(&inode->i_lock); + __ceph_get_fmode(ci, fmode); + spin_unlock(&inode->i_lock); + return ceph_init_file(inode, file, fmode); + } + + /* + * No need to block if we have any caps. Update wanted set + * asynchronously. + */ + spin_lock(&inode->i_lock); + if (__ceph_is_any_real_caps(ci)) { + int mds_wanted = __ceph_caps_mds_wanted(ci); + int issued = __ceph_caps_issued(ci, NULL); + + dout("open %p fmode %d want %s issued %s using existing\n", + inode, fmode, ceph_cap_string(wanted), + ceph_cap_string(issued)); + __ceph_get_fmode(ci, fmode); + spin_unlock(&inode->i_lock); + + /* adjust wanted? */ + if ((issued & wanted) != wanted && + (mds_wanted & wanted) != wanted && + ceph_snap(inode) != CEPH_SNAPDIR) + ceph_check_caps(ci, 0, NULL); + + return ceph_init_file(inode, file, fmode); + } else if (ceph_snap(inode) != CEPH_NOSNAP && + (ci->i_snap_caps & wanted) == wanted) { + __ceph_get_fmode(ci, fmode); + spin_unlock(&inode->i_lock); + return ceph_init_file(inode, file, fmode); + } + spin_unlock(&inode->i_lock); + + dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); + req = prepare_open_request(inode->i_sb, flags, 0); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + req->r_inode = igrab(inode); + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, parent_inode, req); + if (!err) + err = ceph_init_file(inode, file, req->r_fmode); + ceph_mdsc_put_request(req); + dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode)); +out: + return err; +} + + +/* + * Do a lookup + open with a single request. + * + * If this succeeds, but some subsequent check in the vfs + * may_open() fails, the struct *file gets cleaned up (i.e. + * ceph_release gets called). So fear not! + */ +/* + * flags + * path_lookup_open -> LOOKUP_OPEN + * path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE + */ +struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, + struct nameidata *nd, int mode, + int locked_dir) +{ + struct ceph_client *client = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = &client->mdsc; + struct file *file = nd->intent.open.file; + struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry); + struct ceph_mds_request *req; + int err; + int flags = nd->intent.open.flags - 1; /* silly vfs! */ + + dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n", + dentry, dentry->d_name.len, dentry->d_name.name, flags, mode); + + /* do the open */ + req = prepare_open_request(dir->i_sb, flags, mode); + if (IS_ERR(req)) + return ERR_PTR(PTR_ERR(req)); + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + if (flags & O_CREAT) { + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + } + req->r_locked_dir = dir; /* caller holds dir->i_mutex */ + err = ceph_mdsc_do_request(mdsc, parent_inode, req); + dentry = ceph_finish_lookup(req, dentry, err); + if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) + err = ceph_handle_notrace_create(dir, dentry); + if (!err) + err = ceph_init_file(req->r_dentry->d_inode, file, + req->r_fmode); + ceph_mdsc_put_request(req); + dout("ceph_lookup_open result=%p\n", dentry); + return dentry; +} + +int ceph_release(struct inode *inode, struct file *file) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_info *cf = file->private_data; + + dout("release inode %p file %p\n", inode, file); + ceph_put_fmode(ci, cf->fmode); + if (cf->last_readdir) + ceph_mdsc_put_request(cf->last_readdir); + kfree(cf->last_name); + kfree(cf->dir_info); + dput(cf->dentry); + kmem_cache_free(ceph_file_cachep, cf); + + /* wake up anyone waiting for caps on this inode */ + wake_up(&ci->i_cap_wq); + return 0; +} + +/* + * build a vector of user pages + */ +static struct page **get_direct_page_vector(const char __user *data, + int num_pages, + loff_t off, size_t len) +{ + struct page **pages; + int rc; + + pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); + if (!pages) + return ERR_PTR(-ENOMEM); + + down_read(¤t->mm->mmap_sem); + rc = get_user_pages(current, current->mm, (unsigned long)data, + num_pages, 0, 0, pages, NULL); + up_read(¤t->mm->mmap_sem); + if (rc < 0) + goto fail; + return pages; + +fail: + kfree(pages); + return ERR_PTR(rc); +} + +static void put_page_vector(struct page **pages, int num_pages) +{ + int i; + + for (i = 0; i < num_pages; i++) + put_page(pages[i]); + kfree(pages); +} + +void ceph_release_page_vector(struct page **pages, int num_pages) +{ + int i; + + for (i = 0; i < num_pages; i++) + __free_pages(pages[i], 0); + kfree(pages); +} + +/* + * allocate a vector new pages + */ +static struct page **alloc_page_vector(int num_pages) +{ + struct page **pages; + int i; + + pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); + if (!pages) + return ERR_PTR(-ENOMEM); + for (i = 0; i < num_pages; i++) { + pages[i] = alloc_page(GFP_NOFS); + if (pages[i] == NULL) { + ceph_release_page_vector(pages, i); + return ERR_PTR(-ENOMEM); + } + } + return pages; +} + +/* + * copy user data into a page vector + */ +static int copy_user_to_page_vector(struct page **pages, + const char __user *data, + loff_t off, size_t len) +{ + int i = 0; + int po = off & ~PAGE_CACHE_MASK; + int left = len; + int l, bad; + + while (left > 0) { + l = min_t(int, PAGE_CACHE_SIZE-po, left); + bad = copy_from_user(page_address(pages[i]) + po, data, l); + if (bad == l) + return -EFAULT; + data += l - bad; + left -= l - bad; + po += l - bad; + if (po == PAGE_CACHE_SIZE) { + po = 0; + i++; + } + } + return len; +} + +/* + * copy user data from a page vector into a user pointer + */ +static int copy_page_vector_to_user(struct page **pages, char __user *data, + loff_t off, size_t len) +{ + int i = 0; + int po = off & ~PAGE_CACHE_MASK; + int left = len; + int l, bad; + + while (left > 0) { + l = min_t(int, left, PAGE_CACHE_SIZE-po); + bad = copy_to_user(data, page_address(pages[i]) + po, l); + if (bad == l) + return -EFAULT; + data += l - bad; + left -= l - bad; + if (po) { + po += l - bad; + if (po == PAGE_CACHE_SIZE) + po = 0; + } + i++; + } + return len; +} + +/* + * Zero an extent within a page vector. Offset is relative to the + * start of the first page. + */ +static void zero_page_vector_range(int off, int len, struct page **pages) +{ + int i = off >> PAGE_CACHE_SHIFT; + + off &= ~PAGE_CACHE_MASK; + + dout("zero_page_vector_page %u~%u\n", off, len); + + /* leading partial page? */ + if (off) { + int end = min((int)PAGE_CACHE_SIZE, off + len); + dout("zeroing %d %p head from %d\n", i, pages[i], + (int)off); + zero_user_segment(pages[i], off, end); + len -= (end - off); + i++; + } + while (len >= PAGE_CACHE_SIZE) { + dout("zeroing %d %p len=%d\n", i, pages[i], len); + zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE); + len -= PAGE_CACHE_SIZE; + i++; + } + /* trailing partial page? */ + if (len) { + dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len); + zero_user_segment(pages[i], 0, len); + } +} + + +/* + * Read a range of bytes striped over one or more objects. Iterate over + * objects we stripe over. (That's not atomic, but good enough for now.) + * + * If we get a short result from the OSD, check against i_size; we need to + * only return a short read to the caller if we hit EOF. + */ +static int striped_read(struct inode *inode, + u64 off, u64 len, + struct page **pages, int num_pages, + int *checkeof) +{ + struct ceph_client *client = ceph_inode_to_client(inode); + struct ceph_inode_info *ci = ceph_inode(inode); + u64 pos, this_len; + int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ + int left, pages_left; + int read; + struct page **page_pos; + int ret; + bool hit_stripe, was_short; + + /* + * we may need to do multiple reads. not atomic, unfortunately. + */ + pos = off; + left = len; + page_pos = pages; + pages_left = num_pages; + read = 0; + +more: + this_len = left; + ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode), + &ci->i_layout, pos, &this_len, + ci->i_truncate_seq, + ci->i_truncate_size, + page_pos, pages_left); + hit_stripe = this_len < left; + was_short = ret >= 0 && ret < this_len; + if (ret == -ENOENT) + ret = 0; + dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read, + ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); + + if (ret > 0) { + int didpages = + ((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT; + + if (read < pos - off) { + dout(" zero gap %llu to %llu\n", off + read, pos); + zero_page_vector_range(page_off + read, + pos - off - read, pages); + } + pos += ret; + read = pos - off; + left -= ret; + page_pos += didpages; + pages_left -= didpages; + + /* hit stripe? */ + if (left && hit_stripe) + goto more; + } + + if (was_short) { + /* was original extent fully inside i_size? */ + if (pos + left <= inode->i_size) { + dout("zero tail\n"); + zero_page_vector_range(page_off + read, len - read, + pages); + read = len; + goto out; + } + + /* check i_size */ + *checkeof = 1; + } + +out: + if (ret >= 0) + ret = read; + dout("striped_read returns %d\n", ret); + return ret; +} + +/* + * Completely synchronous read and write methods. Direct from __user + * buffer to osd, or directly to user pages (if O_DIRECT). + * + * If the read spans object boundary, just do multiple reads. + */ +static ssize_t ceph_sync_read(struct file *file, char __user *data, + unsigned len, loff_t *poff, int *checkeof) +{ + struct inode *inode = file->f_dentry->d_inode; + struct page **pages; + u64 off = *poff; + int num_pages = calc_pages_for(off, len); + int ret; + + dout("sync_read on file %p %llu~%u %s\n", file, off, len, + (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); + + if (file->f_flags & O_DIRECT) { + pages = get_direct_page_vector(data, num_pages, off, len); + + /* + * flush any page cache pages in this range. this + * will make concurrent normal and O_DIRECT io slow, + * but it will at least behave sensibly when they are + * in sequence. + */ + } else { + pages = alloc_page_vector(num_pages); + } + if (IS_ERR(pages)) + return PTR_ERR(pages); + + ret = filemap_write_and_wait(inode->i_mapping); + if (ret < 0) + goto done; + + ret = striped_read(inode, off, len, pages, num_pages, checkeof); + + if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) + ret = copy_page_vector_to_user(pages, data, off, ret); + if (ret >= 0) + *poff = off + ret; + +done: + if (file->f_flags & O_DIRECT) + put_page_vector(pages, num_pages); + else + ceph_release_page_vector(pages, num_pages); + dout("sync_read result %d\n", ret); + return ret; +} + +/* + * Write commit callback, called if we requested both an ACK and + * ONDISK commit reply from the OSD. + */ +static void sync_write_commit(struct ceph_osd_request *req, + struct ceph_msg *msg) +{ + struct ceph_inode_info *ci = ceph_inode(req->r_inode); + + dout("sync_write_commit %p tid %llu\n", req, req->r_tid); + spin_lock(&ci->i_unsafe_lock); + list_del_init(&req->r_unsafe_item); + spin_unlock(&ci->i_unsafe_lock); + ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); +} + +/* + * Synchronous write, straight from __user pointer or user pages (if + * O_DIRECT). + * + * If write spans object boundary, just do multiple writes. (For a + * correct atomic write, we should e.g. take write locks on all + * objects, rollback on failure, etc.) + */ +static ssize_t ceph_sync_write(struct file *file, const char __user *data, + size_t left, loff_t *offset) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *client = ceph_inode_to_client(inode); + struct ceph_osd_request *req; + struct page **pages; + int num_pages; + long long unsigned pos; + u64 len; + int written = 0; + int flags; + int do_sync = 0; + int check_caps = 0; + int ret; + struct timespec mtime = CURRENT_TIME; + + if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP) + return -EROFS; + + dout("sync_write on file %p %lld~%u %s\n", file, *offset, + (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); + + if (file->f_flags & O_APPEND) + pos = i_size_read(inode); + else + pos = *offset; + + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); + if (ret < 0) + return ret; + + ret = invalidate_inode_pages2_range(inode->i_mapping, + pos >> PAGE_CACHE_SHIFT, + (pos + left) >> PAGE_CACHE_SHIFT); + if (ret < 0) + dout("invalidate_inode_pages2_range returned %d\n", ret); + + flags = CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ONDISK | + CEPH_OSD_FLAG_WRITE; + if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0) + flags |= CEPH_OSD_FLAG_ACK; + else + do_sync = 1; + + /* + * we may need to do multiple writes here if we span an object + * boundary. this isn't atomic, unfortunately. :( + */ +more: + len = left; + req = ceph_osdc_new_request(&client->osdc, &ci->i_layout, + ceph_vino(inode), pos, &len, + CEPH_OSD_OP_WRITE, flags, + ci->i_snap_realm->cached_context, + do_sync, + ci->i_truncate_seq, ci->i_truncate_size, + &mtime, false, 2); + if (IS_ERR(req)) + return PTR_ERR(req); + + num_pages = calc_pages_for(pos, len); + + if (file->f_flags & O_DIRECT) { + pages = get_direct_page_vector(data, num_pages, pos, len); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + goto out; + } + + /* + * throw out any page cache pages in this range. this + * may block. + */ + truncate_inode_pages_range(inode->i_mapping, pos, pos+len); + } else { + pages = alloc_page_vector(num_pages); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + goto out; + } + ret = copy_user_to_page_vector(pages, data, pos, len); + if (ret < 0) { + ceph_release_page_vector(pages, num_pages); + goto out; + } + + if ((file->f_flags & O_SYNC) == 0) { + /* get a second commit callback */ + req->r_safe_callback = sync_write_commit; + req->r_own_pages = 1; + } + } + req->r_pages = pages; + req->r_num_pages = num_pages; + req->r_inode = inode; + + ret = ceph_osdc_start_request(&client->osdc, req, false); + if (!ret) { + if (req->r_safe_callback) { + /* + * Add to inode unsafe list only after we + * start_request so that a tid has been assigned. + */ + spin_lock(&ci->i_unsafe_lock); + list_add(&ci->i_unsafe_writes, &req->r_unsafe_item); + spin_unlock(&ci->i_unsafe_lock); + ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); + } + ret = ceph_osdc_wait_request(&client->osdc, req); + } + + if (file->f_flags & O_DIRECT) + put_page_vector(pages, num_pages); + else if (file->f_flags & O_SYNC) + ceph_release_page_vector(pages, num_pages); + +out: + ceph_osdc_put_request(req); + if (ret == 0) { + pos += len; + written += len; + left -= len; + if (left) + goto more; + + ret = written; + *offset = pos; + if (pos > i_size_read(inode)) + check_caps = ceph_inode_set_size(inode, pos); + if (check_caps) + ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, + NULL); + } + return ret; +} + +/* + * Wrap generic_file_aio_read with checks for cap bits on the inode. + * Atomically grab references, so that those bits are not released + * back to the MDS mid-read. + * + * Hmm, the sync read case isn't actually async... should it be? + */ +static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *filp = iocb->ki_filp; + loff_t *ppos = &iocb->ki_pos; + size_t len = iov->iov_len; + struct inode *inode = filp->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + void *base = iov->iov_base; + ssize_t ret; + int got = 0; + int checkeof = 0, read = 0; + + dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", + inode, ceph_vinop(inode), pos, (unsigned)len, inode); +again: + __ceph_do_pending_vmtruncate(inode); + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, + &got, -1); + if (ret < 0) + goto out; + dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), pos, (unsigned)len, + ceph_cap_string(got)); + + if ((got & CEPH_CAP_FILE_CACHE) == 0 || + (iocb->ki_filp->f_flags & O_DIRECT) || + (inode->i_sb->s_flags & MS_SYNCHRONOUS)) + /* hmm, this isn't really async... */ + ret = ceph_sync_read(filp, base, len, ppos, &checkeof); + else + ret = generic_file_aio_read(iocb, iov, nr_segs, pos); + +out: + dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", + inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); + ceph_put_cap_refs(ci, got); + + if (checkeof && ret >= 0) { + int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); + + /* hit EOF or hole? */ + if (statret == 0 && *ppos < inode->i_size) { + dout("aio_read sync_read hit hole, reading more\n"); + read += ret; + base += ret; + len -= ret; + checkeof = 0; + goto again; + } + } + if (ret >= 0) + ret += read; + + return ret; +} + +/* + * Take cap references to avoid releasing caps to MDS mid-write. + * + * If we are synchronous, and write with an old snap context, the OSD + * may return EOLDSNAPC. In that case, retry the write.. _after_ + * dropping our cap refs and allowing the pending snap to logically + * complete _before_ this write occurs. + * + * If we are near ENOSPC, write synchronously. + */ +static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; + loff_t endoff = pos + iov->iov_len; + int got = 0; + int ret, err; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EROFS; + +retry_snap: + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) + return -ENOSPC; + __ceph_do_pending_vmtruncate(inode); + dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", + inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, + inode->i_size); + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, + &got, endoff); + if (ret < 0) + goto out; + + dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, + ceph_cap_string(got)); + + if ((got & CEPH_CAP_FILE_BUFFER) == 0 || + (iocb->ki_filp->f_flags & O_DIRECT) || + (inode->i_sb->s_flags & MS_SYNCHRONOUS)) { + ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, + &iocb->ki_pos); + } else { + ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + + if ((ret >= 0 || ret == -EIOCBQUEUED) && + ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) + || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { + err = vfs_fsync_range(file, file->f_path.dentry, + pos, pos + ret - 1, 1); + if (err < 0) + ret = err; + } + } + if (ret >= 0) { + spin_lock(&inode->i_lock); + __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&inode->i_lock); + } + +out: + dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", + inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, + ceph_cap_string(got)); + ceph_put_cap_refs(ci, got); + + if (ret == -EOLDSNAPC) { + dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", + inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); + goto retry_snap; + } + + return ret; +} + +/* + * llseek. be sure to verify file size on SEEK_END. + */ +static loff_t ceph_llseek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + int ret; + + mutex_lock(&inode->i_mutex); + __ceph_do_pending_vmtruncate(inode); + switch (origin) { + case SEEK_END: + ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); + if (ret < 0) { + offset = ret; + goto out; + } + offset += inode->i_size; + break; + case SEEK_CUR: + /* + * Here we special-case the lseek(fd, 0, SEEK_CUR) + * position-querying operation. Avoid rewriting the "same" + * f_pos value back to the file because a concurrent read(), + * write() or lseek() might have altered it + */ + if (offset == 0) { + offset = file->f_pos; + goto out; + } + offset += file->f_pos; + break; + } + + if (offset < 0 || offset > inode->i_sb->s_maxbytes) { + offset = -EINVAL; + goto out; + } + + /* Special lock needed here? */ + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + +out: + mutex_unlock(&inode->i_mutex); + return offset; +} + +const struct file_operations ceph_file_fops = { + .open = ceph_open, + .release = ceph_release, + .llseek = ceph_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = ceph_aio_read, + .aio_write = ceph_aio_write, + .mmap = ceph_mmap, + .fsync = ceph_fsync, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .unlocked_ioctl = ceph_ioctl, + .compat_ioctl = ceph_ioctl, +}; + diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c new file mode 100644 index 00000000000..7abe1aed819 --- /dev/null +++ b/fs/ceph/inode.c @@ -0,0 +1,1750 @@ +#include "ceph_debug.h" + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/smp_lock.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <linux/namei.h> +#include <linux/writeback.h> +#include <linux/vmalloc.h> +#include <linux/pagevec.h> + +#include "super.h" +#include "decode.h" + +/* + * Ceph inode operations + * + * Implement basic inode helpers (get, alloc) and inode ops (getattr, + * setattr, etc.), xattr helpers, and helpers for assimilating + * metadata returned by the MDS into our cache. + * + * Also define helpers for doing asynchronous writeback, invalidation, + * and truncation for the benefit of those who can't afford to block + * (typically because they are in the message handler path). + */ + +static const struct inode_operations ceph_symlink_iops; + +static void ceph_invalidate_work(struct work_struct *work); +static void ceph_writeback_work(struct work_struct *work); +static void ceph_vmtruncate_work(struct work_struct *work); + +/* + * find or create an inode, given the ceph ino number + */ +struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) +{ + struct inode *inode; + ino_t t = ceph_vino_to_ino(vino); + + inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino); + if (inode == NULL) + return ERR_PTR(-ENOMEM); + if (inode->i_state & I_NEW) { + dout("get_inode created new inode %p %llx.%llx ino %llx\n", + inode, ceph_vinop(inode), (u64)inode->i_ino); + unlock_new_inode(inode); + } + + dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino, + vino.snap, inode); + return inode; +} + +/* + * get/constuct snapdir inode for a given directory + */ +struct inode *ceph_get_snapdir(struct inode *parent) +{ + struct ceph_vino vino = { + .ino = ceph_ino(parent), + .snap = CEPH_SNAPDIR, + }; + struct inode *inode = ceph_get_inode(parent->i_sb, vino); + struct ceph_inode_info *ci = ceph_inode(inode); + + BUG_ON(!S_ISDIR(parent->i_mode)); + if (IS_ERR(inode)) + return ERR_PTR(PTR_ERR(inode)); + inode->i_mode = parent->i_mode; + inode->i_uid = parent->i_uid; + inode->i_gid = parent->i_gid; + inode->i_op = &ceph_dir_iops; + inode->i_fop = &ceph_dir_fops; + ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ + ci->i_rbytes = 0; + return inode; +} + +const struct inode_operations ceph_file_iops = { + .permission = ceph_permission, + .setattr = ceph_setattr, + .getattr = ceph_getattr, + .setxattr = ceph_setxattr, + .getxattr = ceph_getxattr, + .listxattr = ceph_listxattr, + .removexattr = ceph_removexattr, +}; + + +/* + * We use a 'frag tree' to keep track of the MDS's directory fragments + * for a given inode (usually there is just a single fragment). We + * need to know when a child frag is delegated to a new MDS, or when + * it is flagged as replicated, so we can direct our requests + * accordingly. + */ + +/* + * find/create a frag in the tree + */ +static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci, + u32 f) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct ceph_inode_frag *frag; + int c; + + p = &ci->i_fragtree.rb_node; + while (*p) { + parent = *p; + frag = rb_entry(parent, struct ceph_inode_frag, node); + c = ceph_frag_compare(f, frag->frag); + if (c < 0) + p = &(*p)->rb_left; + else if (c > 0) + p = &(*p)->rb_right; + else + return frag; + } + + frag = kmalloc(sizeof(*frag), GFP_NOFS); + if (!frag) { + pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx " + "frag %x\n", &ci->vfs_inode, + ceph_vinop(&ci->vfs_inode), f); + return ERR_PTR(-ENOMEM); + } + frag->frag = f; + frag->split_by = 0; + frag->mds = -1; + frag->ndist = 0; + + rb_link_node(&frag->node, parent, p); + rb_insert_color(&frag->node, &ci->i_fragtree); + + dout("get_or_create_frag added %llx.%llx frag %x\n", + ceph_vinop(&ci->vfs_inode), f); + return frag; +} + +/* + * find a specific frag @f + */ +struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f) +{ + struct rb_node *n = ci->i_fragtree.rb_node; + + while (n) { + struct ceph_inode_frag *frag = + rb_entry(n, struct ceph_inode_frag, node); + int c = ceph_frag_compare(f, frag->frag); + if (c < 0) + n = n->rb_left; + else if (c > 0) + n = n->rb_right; + else + return frag; + } + return NULL; +} + +/* + * Choose frag containing the given value @v. If @pfrag is + * specified, copy the frag delegation info to the caller if + * it is present. + */ +u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, + struct ceph_inode_frag *pfrag, + int *found) +{ + u32 t = ceph_frag_make(0, 0); + struct ceph_inode_frag *frag; + unsigned nway, i; + u32 n; + + if (found) + *found = 0; + + mutex_lock(&ci->i_fragtree_mutex); + while (1) { + WARN_ON(!ceph_frag_contains_value(t, v)); + frag = __ceph_find_frag(ci, t); + if (!frag) + break; /* t is a leaf */ + if (frag->split_by == 0) { + if (pfrag) + memcpy(pfrag, frag, sizeof(*pfrag)); + if (found) + *found = 1; + break; + } + + /* choose child */ + nway = 1 << frag->split_by; + dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t, + frag->split_by, nway); + for (i = 0; i < nway; i++) { + n = ceph_frag_make_child(t, frag->split_by, i); + if (ceph_frag_contains_value(n, v)) { + t = n; + break; + } + } + BUG_ON(i == nway); + } + dout("choose_frag(%x) = %x\n", v, t); + + mutex_unlock(&ci->i_fragtree_mutex); + return t; +} + +/* + * Process dirfrag (delegation) info from the mds. Include leaf + * fragment in tree ONLY if ndist > 0. Otherwise, only + * branches/splits are included in i_fragtree) + */ +static int ceph_fill_dirfrag(struct inode *inode, + struct ceph_mds_reply_dirfrag *dirinfo) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_inode_frag *frag; + u32 id = le32_to_cpu(dirinfo->frag); + int mds = le32_to_cpu(dirinfo->auth); + int ndist = le32_to_cpu(dirinfo->ndist); + int i; + int err = 0; + + mutex_lock(&ci->i_fragtree_mutex); + if (ndist == 0) { + /* no delegation info needed. */ + frag = __ceph_find_frag(ci, id); + if (!frag) + goto out; + if (frag->split_by == 0) { + /* tree leaf, remove */ + dout("fill_dirfrag removed %llx.%llx frag %x" + " (no ref)\n", ceph_vinop(inode), id); + rb_erase(&frag->node, &ci->i_fragtree); + kfree(frag); + } else { + /* tree branch, keep and clear */ + dout("fill_dirfrag cleared %llx.%llx frag %x" + " referral\n", ceph_vinop(inode), id); + frag->mds = -1; + frag->ndist = 0; + } + goto out; + } + + + /* find/add this frag to store mds delegation info */ + frag = __get_or_create_frag(ci, id); + if (IS_ERR(frag)) { + /* this is not the end of the world; we can continue + with bad/inaccurate delegation info */ + pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n", + ceph_vinop(inode), le32_to_cpu(dirinfo->frag)); + err = -ENOMEM; + goto out; + } + + frag->mds = mds; + frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP); + for (i = 0; i < frag->ndist; i++) + frag->dist[i] = le32_to_cpu(dirinfo->dist[i]); + dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n", + ceph_vinop(inode), frag->frag, frag->ndist); + +out: + mutex_unlock(&ci->i_fragtree_mutex); + return err; +} + + +/* + * initialize a newly allocated inode. + */ +struct inode *ceph_alloc_inode(struct super_block *sb) +{ + struct ceph_inode_info *ci; + int i; + + ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS); + if (!ci) + return NULL; + + dout("alloc_inode %p\n", &ci->vfs_inode); + + ci->i_version = 0; + ci->i_time_warp_seq = 0; + ci->i_ceph_flags = 0; + ci->i_release_count = 0; + ci->i_symlink = NULL; + + ci->i_fragtree = RB_ROOT; + mutex_init(&ci->i_fragtree_mutex); + + ci->i_xattrs.blob = NULL; + ci->i_xattrs.prealloc_blob = NULL; + ci->i_xattrs.dirty = false; + ci->i_xattrs.index = RB_ROOT; + ci->i_xattrs.count = 0; + ci->i_xattrs.names_size = 0; + ci->i_xattrs.vals_size = 0; + ci->i_xattrs.version = 0; + ci->i_xattrs.index_version = 0; + + ci->i_caps = RB_ROOT; + ci->i_auth_cap = NULL; + ci->i_dirty_caps = 0; + ci->i_flushing_caps = 0; + INIT_LIST_HEAD(&ci->i_dirty_item); + INIT_LIST_HEAD(&ci->i_flushing_item); + ci->i_cap_flush_seq = 0; + ci->i_cap_flush_last_tid = 0; + memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid)); + init_waitqueue_head(&ci->i_cap_wq); + ci->i_hold_caps_min = 0; + ci->i_hold_caps_max = 0; + INIT_LIST_HEAD(&ci->i_cap_delay_list); + ci->i_cap_exporting_mds = 0; + ci->i_cap_exporting_mseq = 0; + ci->i_cap_exporting_issued = 0; + INIT_LIST_HEAD(&ci->i_cap_snaps); + ci->i_head_snapc = NULL; + ci->i_snap_caps = 0; + + for (i = 0; i < CEPH_FILE_MODE_NUM; i++) + ci->i_nr_by_mode[i] = 0; + + ci->i_truncate_seq = 0; + ci->i_truncate_size = 0; + ci->i_truncate_pending = 0; + + ci->i_max_size = 0; + ci->i_reported_size = 0; + ci->i_wanted_max_size = 0; + ci->i_requested_max_size = 0; + + ci->i_pin_ref = 0; + ci->i_rd_ref = 0; + ci->i_rdcache_ref = 0; + ci->i_wr_ref = 0; + ci->i_wrbuffer_ref = 0; + ci->i_wrbuffer_ref_head = 0; + ci->i_shared_gen = 0; + ci->i_rdcache_gen = 0; + ci->i_rdcache_revoking = 0; + + INIT_LIST_HEAD(&ci->i_unsafe_writes); + INIT_LIST_HEAD(&ci->i_unsafe_dirops); + spin_lock_init(&ci->i_unsafe_lock); + + ci->i_snap_realm = NULL; + INIT_LIST_HEAD(&ci->i_snap_realm_item); + INIT_LIST_HEAD(&ci->i_snap_flush_item); + + INIT_WORK(&ci->i_wb_work, ceph_writeback_work); + INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work); + + INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work); + + return &ci->vfs_inode; +} + +void ceph_destroy_inode(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_inode_frag *frag; + struct rb_node *n; + + dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode)); + + ceph_queue_caps_release(inode); + + kfree(ci->i_symlink); + while ((n = rb_first(&ci->i_fragtree)) != NULL) { + frag = rb_entry(n, struct ceph_inode_frag, node); + rb_erase(n, &ci->i_fragtree); + kfree(frag); + } + + __ceph_destroy_xattrs(ci); + if (ci->i_xattrs.blob) + ceph_buffer_put(ci->i_xattrs.blob); + if (ci->i_xattrs.prealloc_blob) + ceph_buffer_put(ci->i_xattrs.prealloc_blob); + + kmem_cache_free(ceph_inode_cachep, ci); +} + + +/* + * Helpers to fill in size, ctime, mtime, and atime. We have to be + * careful because either the client or MDS may have more up to date + * info, depending on which capabilities are held, and whether + * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime + * and size are monotonically increasing, except when utimes() or + * truncate() increments the corresponding _seq values.) + */ +int ceph_fill_file_size(struct inode *inode, int issued, + u32 truncate_seq, u64 truncate_size, u64 size) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int queue_trunc = 0; + + if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || + (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) { + dout("size %lld -> %llu\n", inode->i_size, size); + inode->i_size = size; + inode->i_blocks = (size + (1<<9) - 1) >> 9; + ci->i_reported_size = size; + if (truncate_seq != ci->i_truncate_seq) { + dout("truncate_seq %u -> %u\n", + ci->i_truncate_seq, truncate_seq); + ci->i_truncate_seq = truncate_seq; + /* + * If we hold relevant caps, or in the case where we're + * not the only client referencing this file and we + * don't hold those caps, then we need to check whether + * the file is either opened or mmaped + */ + if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD| + CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER| + CEPH_CAP_FILE_EXCL)) || + mapping_mapped(inode->i_mapping) || + __ceph_caps_file_wanted(ci)) { + ci->i_truncate_pending++; + queue_trunc = 1; + } + } + } + if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 && + ci->i_truncate_size != truncate_size) { + dout("truncate_size %lld -> %llu\n", ci->i_truncate_size, + truncate_size); + ci->i_truncate_size = truncate_size; + } + return queue_trunc; +} + +void ceph_fill_file_time(struct inode *inode, int issued, + u64 time_warp_seq, struct timespec *ctime, + struct timespec *mtime, struct timespec *atime) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int warn = 0; + + if (issued & (CEPH_CAP_FILE_EXCL| + CEPH_CAP_FILE_WR| + CEPH_CAP_FILE_BUFFER)) { + if (timespec_compare(ctime, &inode->i_ctime) > 0) { + dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n", + inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, + ctime->tv_sec, ctime->tv_nsec); + inode->i_ctime = *ctime; + } + if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) { + /* the MDS did a utimes() */ + dout("mtime %ld.%09ld -> %ld.%09ld " + "tw %d -> %d\n", + inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, + mtime->tv_sec, mtime->tv_nsec, + ci->i_time_warp_seq, (int)time_warp_seq); + + inode->i_mtime = *mtime; + inode->i_atime = *atime; + ci->i_time_warp_seq = time_warp_seq; + } else if (time_warp_seq == ci->i_time_warp_seq) { + /* nobody did utimes(); take the max */ + if (timespec_compare(mtime, &inode->i_mtime) > 0) { + dout("mtime %ld.%09ld -> %ld.%09ld inc\n", + inode->i_mtime.tv_sec, + inode->i_mtime.tv_nsec, + mtime->tv_sec, mtime->tv_nsec); + inode->i_mtime = *mtime; + } + if (timespec_compare(atime, &inode->i_atime) > 0) { + dout("atime %ld.%09ld -> %ld.%09ld inc\n", + inode->i_atime.tv_sec, + inode->i_atime.tv_nsec, + atime->tv_sec, atime->tv_nsec); + inode->i_atime = *atime; + } + } else if (issued & CEPH_CAP_FILE_EXCL) { + /* we did a utimes(); ignore mds values */ + } else { + warn = 1; + } + } else { + /* we have no write caps; whatever the MDS says is true */ + if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { + inode->i_ctime = *ctime; + inode->i_mtime = *mtime; + inode->i_atime = *atime; + ci->i_time_warp_seq = time_warp_seq; + } else { + warn = 1; + } + } + if (warn) /* time_warp_seq shouldn't go backwards */ + dout("%p mds time_warp_seq %llu < %u\n", + inode, time_warp_seq, ci->i_time_warp_seq); +} + +/* + * Populate an inode based on info from mds. May be called on new or + * existing inodes. + */ +static int fill_inode(struct inode *inode, + struct ceph_mds_reply_info_in *iinfo, + struct ceph_mds_reply_dirfrag *dirinfo, + struct ceph_mds_session *session, + unsigned long ttl_from, int cap_fmode, + struct ceph_cap_reservation *caps_reservation) +{ + struct ceph_mds_reply_inode *info = iinfo->in; + struct ceph_inode_info *ci = ceph_inode(inode); + int i; + int issued, implemented; + struct timespec mtime, atime, ctime; + u32 nsplits; + struct ceph_buffer *xattr_blob = NULL; + int err = 0; + int queue_trunc = 0; + + dout("fill_inode %p ino %llx.%llx v %llu had %llu\n", + inode, ceph_vinop(inode), le64_to_cpu(info->version), + ci->i_version); + + /* + * prealloc xattr data, if it looks like we'll need it. only + * if len > 4 (meaning there are actually xattrs; the first 4 + * bytes are the xattr count). + */ + if (iinfo->xattr_len > 4) { + xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS); + if (!xattr_blob) + pr_err("fill_inode ENOMEM xattr blob %d bytes\n", + iinfo->xattr_len); + } + + spin_lock(&inode->i_lock); + + /* + * provided version will be odd if inode value is projected, + * even if stable. skip the update if we have a newer info + * (e.g., due to inode info racing form multiple MDSs), or if + * we are getting projected (unstable) inode info. + */ + if (le64_to_cpu(info->version) > 0 && + (ci->i_version & ~1) > le64_to_cpu(info->version)) + goto no_change; + + issued = __ceph_caps_issued(ci, &implemented); + issued |= implemented | __ceph_caps_dirty(ci); + + /* update inode */ + ci->i_version = le64_to_cpu(info->version); + inode->i_version++; + inode->i_rdev = le32_to_cpu(info->rdev); + + if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { + inode->i_mode = le32_to_cpu(info->mode); + inode->i_uid = le32_to_cpu(info->uid); + inode->i_gid = le32_to_cpu(info->gid); + dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, + inode->i_uid, inode->i_gid); + } + + if ((issued & CEPH_CAP_LINK_EXCL) == 0) + inode->i_nlink = le32_to_cpu(info->nlink); + + /* be careful with mtime, atime, size */ + ceph_decode_timespec(&atime, &info->atime); + ceph_decode_timespec(&mtime, &info->mtime); + ceph_decode_timespec(&ctime, &info->ctime); + queue_trunc = ceph_fill_file_size(inode, issued, + le32_to_cpu(info->truncate_seq), + le64_to_cpu(info->truncate_size), + le64_to_cpu(info->size)); + ceph_fill_file_time(inode, issued, + le32_to_cpu(info->time_warp_seq), + &ctime, &mtime, &atime); + + ci->i_max_size = le64_to_cpu(info->max_size); + ci->i_layout = info->layout; + inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + + /* xattrs */ + /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */ + if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && + le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) { + if (ci->i_xattrs.blob) + ceph_buffer_put(ci->i_xattrs.blob); + ci->i_xattrs.blob = xattr_blob; + if (xattr_blob) + memcpy(ci->i_xattrs.blob->vec.iov_base, + iinfo->xattr_data, iinfo->xattr_len); + ci->i_xattrs.version = le64_to_cpu(info->xattr_version); + } + + inode->i_mapping->a_ops = &ceph_aops; + inode->i_mapping->backing_dev_info = + &ceph_client(inode->i_sb)->backing_dev_info; + + switch (inode->i_mode & S_IFMT) { + case S_IFIFO: + case S_IFBLK: + case S_IFCHR: + case S_IFSOCK: + init_special_inode(inode, inode->i_mode, inode->i_rdev); + inode->i_op = &ceph_file_iops; + break; + case S_IFREG: + inode->i_op = &ceph_file_iops; + inode->i_fop = &ceph_file_fops; + break; + case S_IFLNK: + inode->i_op = &ceph_symlink_iops; + if (!ci->i_symlink) { + int symlen = iinfo->symlink_len; + char *sym; + + BUG_ON(symlen != inode->i_size); + spin_unlock(&inode->i_lock); + + err = -ENOMEM; + sym = kmalloc(symlen+1, GFP_NOFS); + if (!sym) + goto out; + memcpy(sym, iinfo->symlink, symlen); + sym[symlen] = 0; + + spin_lock(&inode->i_lock); + if (!ci->i_symlink) + ci->i_symlink = sym; + else + kfree(sym); /* lost a race */ + } + break; + case S_IFDIR: + inode->i_op = &ceph_dir_iops; + inode->i_fop = &ceph_dir_fops; + + ci->i_files = le64_to_cpu(info->files); + ci->i_subdirs = le64_to_cpu(info->subdirs); + ci->i_rbytes = le64_to_cpu(info->rbytes); + ci->i_rfiles = le64_to_cpu(info->rfiles); + ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); + ceph_decode_timespec(&ci->i_rctime, &info->rctime); + + /* set dir completion flag? */ + if (ci->i_files == 0 && ci->i_subdirs == 0 && + ceph_snap(inode) == CEPH_NOSNAP && + (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) { + dout(" marking %p complete (empty)\n", inode); + ci->i_ceph_flags |= CEPH_I_COMPLETE; + ci->i_max_offset = 2; + } + + /* it may be better to set st_size in getattr instead? */ + if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES)) + inode->i_size = ci->i_rbytes; + break; + default: + pr_err("fill_inode %llx.%llx BAD mode 0%o\n", + ceph_vinop(inode), inode->i_mode); + } + +no_change: + spin_unlock(&inode->i_lock); + + /* queue truncate if we saw i_size decrease */ + if (queue_trunc) + ceph_queue_vmtruncate(inode); + + /* populate frag tree */ + /* FIXME: move me up, if/when version reflects fragtree changes */ + nsplits = le32_to_cpu(info->fragtree.nsplits); + mutex_lock(&ci->i_fragtree_mutex); + for (i = 0; i < nsplits; i++) { + u32 id = le32_to_cpu(info->fragtree.splits[i].frag); + struct ceph_inode_frag *frag = __get_or_create_frag(ci, id); + + if (IS_ERR(frag)) + continue; + frag->split_by = le32_to_cpu(info->fragtree.splits[i].by); + dout(" frag %x split by %d\n", frag->frag, frag->split_by); + } + mutex_unlock(&ci->i_fragtree_mutex); + + /* were we issued a capability? */ + if (info->cap.caps) { + if (ceph_snap(inode) == CEPH_NOSNAP) { + ceph_add_cap(inode, session, + le64_to_cpu(info->cap.cap_id), + cap_fmode, + le32_to_cpu(info->cap.caps), + le32_to_cpu(info->cap.wanted), + le32_to_cpu(info->cap.seq), + le32_to_cpu(info->cap.mseq), + le64_to_cpu(info->cap.realm), + info->cap.flags, + caps_reservation); + } else { + spin_lock(&inode->i_lock); + dout(" %p got snap_caps %s\n", inode, + ceph_cap_string(le32_to_cpu(info->cap.caps))); + ci->i_snap_caps |= le32_to_cpu(info->cap.caps); + if (cap_fmode >= 0) + __ceph_get_fmode(ci, cap_fmode); + spin_unlock(&inode->i_lock); + } + } + + /* update delegation info? */ + if (dirinfo) + ceph_fill_dirfrag(inode, dirinfo); + + err = 0; + +out: + if (xattr_blob) + ceph_buffer_put(xattr_blob); + return err; +} + +/* + * caller should hold session s_mutex. + */ +static void update_dentry_lease(struct dentry *dentry, + struct ceph_mds_reply_lease *lease, + struct ceph_mds_session *session, + unsigned long from_time) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + long unsigned duration = le32_to_cpu(lease->duration_ms); + long unsigned ttl = from_time + (duration * HZ) / 1000; + long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000; + struct inode *dir; + + /* only track leases on regular dentries */ + if (dentry->d_op != &ceph_dentry_ops) + return; + + spin_lock(&dentry->d_lock); + dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n", + dentry, le16_to_cpu(lease->mask), duration, ttl); + + /* make lease_rdcache_gen match directory */ + dir = dentry->d_parent->d_inode; + di->lease_shared_gen = ceph_inode(dir)->i_shared_gen; + + if (lease->mask == 0) + goto out_unlock; + + if (di->lease_gen == session->s_cap_gen && + time_before(ttl, dentry->d_time)) + goto out_unlock; /* we already have a newer lease. */ + + if (di->lease_session && di->lease_session != session) + goto out_unlock; + + ceph_dentry_lru_touch(dentry); + + if (!di->lease_session) + di->lease_session = ceph_get_mds_session(session); + di->lease_gen = session->s_cap_gen; + di->lease_seq = le32_to_cpu(lease->seq); + di->lease_renew_after = half_ttl; + di->lease_renew_from = 0; + dentry->d_time = ttl; +out_unlock: + spin_unlock(&dentry->d_lock); + return; +} + +/* + * splice a dentry to an inode. + * caller must hold directory i_mutex for this to be safe. + * + * we will only rehash the resulting dentry if @prehash is + * true; @prehash will be set to false (for the benefit of + * the caller) if we fail. + */ +static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, + bool *prehash) +{ + struct dentry *realdn; + + /* dn must be unhashed */ + if (!d_unhashed(dn)) + d_drop(dn); + realdn = d_materialise_unique(dn, in); + if (IS_ERR(realdn)) { + pr_err("splice_dentry error %p inode %p ino %llx.%llx\n", + dn, in, ceph_vinop(in)); + if (prehash) + *prehash = false; /* don't rehash on error */ + dn = realdn; /* note realdn contains the error */ + goto out; + } else if (realdn) { + dout("dn %p (%d) spliced with %p (%d) " + "inode %p ino %llx.%llx\n", + dn, atomic_read(&dn->d_count), + realdn, atomic_read(&realdn->d_count), + realdn->d_inode, ceph_vinop(realdn->d_inode)); + dput(dn); + dn = realdn; + } else { + BUG_ON(!ceph_dentry(dn)); + + dout("dn %p attached to %p ino %llx.%llx\n", + dn, dn->d_inode, ceph_vinop(dn->d_inode)); + } + if ((!prehash || *prehash) && d_unhashed(dn)) + d_rehash(dn); +out: + return dn; +} + +/* + * Set dentry's directory position based on the current dir's max, and + * order it in d_subdirs, so that dcache_readdir behaves. + */ +static void ceph_set_dentry_offset(struct dentry *dn) +{ + struct dentry *dir = dn->d_parent; + struct inode *inode = dn->d_parent->d_inode; + struct ceph_dentry_info *di; + + BUG_ON(!inode); + + di = ceph_dentry(dn); + + spin_lock(&inode->i_lock); + di->offset = ceph_inode(inode)->i_max_offset++; + spin_unlock(&inode->i_lock); + + spin_lock(&dcache_lock); + spin_lock(&dn->d_lock); + list_move_tail(&dir->d_subdirs, &dn->d_u.d_child); + dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, + dn->d_u.d_child.prev, dn->d_u.d_child.next); + spin_unlock(&dn->d_lock); + spin_unlock(&dcache_lock); +} + +/* + * Incorporate results into the local cache. This is either just + * one inode, or a directory, dentry, and possibly linked-to inode (e.g., + * after a lookup). + * + * A reply may contain + * a directory inode along with a dentry. + * and/or a target inode + * + * Called with snap_rwsem (read). + */ +int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, + struct ceph_mds_session *session) +{ + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + struct inode *in = NULL; + struct ceph_mds_reply_inode *ininfo; + struct ceph_vino vino; + int i = 0; + int err = 0; + + dout("fill_trace %p is_dentry %d is_target %d\n", req, + rinfo->head->is_dentry, rinfo->head->is_target); + +#if 0 + /* + * Debugging hook: + * + * If we resend completed ops to a recovering mds, we get no + * trace. Since that is very rare, pretend this is the case + * to ensure the 'no trace' handlers in the callers behave. + * + * Fill in inodes unconditionally to avoid breaking cap + * invariants. + */ + if (rinfo->head->op & CEPH_MDS_OP_WRITE) { + pr_info("fill_trace faking empty trace on %lld %s\n", + req->r_tid, ceph_mds_op_name(rinfo->head->op)); + if (rinfo->head->is_dentry) { + rinfo->head->is_dentry = 0; + err = fill_inode(req->r_locked_dir, + &rinfo->diri, rinfo->dirfrag, + session, req->r_request_started, -1); + } + if (rinfo->head->is_target) { + rinfo->head->is_target = 0; + ininfo = rinfo->targeti.in; + vino.ino = le64_to_cpu(ininfo->ino); + vino.snap = le64_to_cpu(ininfo->snapid); + in = ceph_get_inode(sb, vino); + err = fill_inode(in, &rinfo->targeti, NULL, + session, req->r_request_started, + req->r_fmode); + iput(in); + } + } +#endif + + if (!rinfo->head->is_target && !rinfo->head->is_dentry) { + dout("fill_trace reply is empty!\n"); + if (rinfo->head->result == 0 && req->r_locked_dir) { + struct ceph_inode_info *ci = + ceph_inode(req->r_locked_dir); + dout(" clearing %p complete (empty trace)\n", + req->r_locked_dir); + ci->i_ceph_flags &= ~CEPH_I_COMPLETE; + ci->i_release_count++; + } + return 0; + } + + if (rinfo->head->is_dentry) { + struct inode *dir = req->r_locked_dir; + + err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag, + session, req->r_request_started, -1, + &req->r_caps_reservation); + if (err < 0) + return err; + } + + if (rinfo->head->is_dentry && !req->r_aborted) { + /* + * lookup link rename : null -> possibly existing inode + * mknod symlink mkdir : null -> new inode + * unlink : linked -> null + */ + struct inode *dir = req->r_locked_dir; + struct dentry *dn = req->r_dentry; + bool have_dir_cap, have_lease; + + BUG_ON(!dn); + BUG_ON(!dir); + BUG_ON(dn->d_parent->d_inode != dir); + BUG_ON(ceph_ino(dir) != + le64_to_cpu(rinfo->diri.in->ino)); + BUG_ON(ceph_snap(dir) != + le64_to_cpu(rinfo->diri.in->snapid)); + + /* do we have a lease on the whole dir? */ + have_dir_cap = + (le32_to_cpu(rinfo->diri.in->cap.caps) & + CEPH_CAP_FILE_SHARED); + + /* do we have a dn lease? */ + have_lease = have_dir_cap || + (le16_to_cpu(rinfo->dlease->mask) & + CEPH_LOCK_DN); + + if (!have_lease) + dout("fill_trace no dentry lease or dir cap\n"); + + /* rename? */ + if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) { + dout(" src %p '%.*s' dst %p '%.*s'\n", + req->r_old_dentry, + req->r_old_dentry->d_name.len, + req->r_old_dentry->d_name.name, + dn, dn->d_name.len, dn->d_name.name); + dout("fill_trace doing d_move %p -> %p\n", + req->r_old_dentry, dn); + d_move(req->r_old_dentry, dn); + dout(" src %p '%.*s' dst %p '%.*s'\n", + req->r_old_dentry, + req->r_old_dentry->d_name.len, + req->r_old_dentry->d_name.name, + dn, dn->d_name.len, dn->d_name.name); + /* ensure target dentry is invalidated, despite + rehashing bug in vfs_rename_dir */ + dn->d_time = jiffies; + ceph_dentry(dn)->lease_shared_gen = 0; + /* take overwritten dentry's readdir offset */ + ceph_dentry(req->r_old_dentry)->offset = + ceph_dentry(dn)->offset; + dn = req->r_old_dentry; /* use old_dentry */ + in = dn->d_inode; + } + + /* null dentry? */ + if (!rinfo->head->is_target) { + dout("fill_trace null dentry\n"); + if (dn->d_inode) { + dout("d_delete %p\n", dn); + d_delete(dn); + } else { + dout("d_instantiate %p NULL\n", dn); + d_instantiate(dn, NULL); + if (have_lease && d_unhashed(dn)) + d_rehash(dn); + update_dentry_lease(dn, rinfo->dlease, + session, + req->r_request_started); + } + goto done; + } + + /* attach proper inode */ + ininfo = rinfo->targeti.in; + vino.ino = le64_to_cpu(ininfo->ino); + vino.snap = le64_to_cpu(ininfo->snapid); + if (!dn->d_inode) { + in = ceph_get_inode(sb, vino); + if (IS_ERR(in)) { + pr_err("fill_trace bad get_inode " + "%llx.%llx\n", vino.ino, vino.snap); + err = PTR_ERR(in); + d_delete(dn); + goto done; + } + dn = splice_dentry(dn, in, &have_lease); + if (IS_ERR(dn)) { + err = PTR_ERR(dn); + goto done; + } + req->r_dentry = dn; /* may have spliced */ + ceph_set_dentry_offset(dn); + igrab(in); + } else if (ceph_ino(in) == vino.ino && + ceph_snap(in) == vino.snap) { + igrab(in); + } else { + dout(" %p links to %p %llx.%llx, not %llx.%llx\n", + dn, in, ceph_ino(in), ceph_snap(in), + vino.ino, vino.snap); + have_lease = false; + in = NULL; + } + + if (have_lease) + update_dentry_lease(dn, rinfo->dlease, session, + req->r_request_started); + dout(" final dn %p\n", dn); + i++; + } else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || + req->r_op == CEPH_MDS_OP_MKSNAP) { + struct dentry *dn = req->r_dentry; + + /* fill out a snapdir LOOKUPSNAP dentry */ + BUG_ON(!dn); + BUG_ON(!req->r_locked_dir); + BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR); + ininfo = rinfo->targeti.in; + vino.ino = le64_to_cpu(ininfo->ino); + vino.snap = le64_to_cpu(ininfo->snapid); + in = ceph_get_inode(sb, vino); + if (IS_ERR(in)) { + pr_err("fill_inode get_inode badness %llx.%llx\n", + vino.ino, vino.snap); + err = PTR_ERR(in); + d_delete(dn); + goto done; + } + dout(" linking snapped dir %p to dn %p\n", in, dn); + dn = splice_dentry(dn, in, NULL); + if (IS_ERR(dn)) { + err = PTR_ERR(dn); + goto done; + } + ceph_set_dentry_offset(dn); + req->r_dentry = dn; /* may have spliced */ + igrab(in); + rinfo->head->is_dentry = 1; /* fool notrace handlers */ + } + + if (rinfo->head->is_target) { + vino.ino = le64_to_cpu(rinfo->targeti.in->ino); + vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + + if (in == NULL || ceph_ino(in) != vino.ino || + ceph_snap(in) != vino.snap) { + in = ceph_get_inode(sb, vino); + if (IS_ERR(in)) { + err = PTR_ERR(in); + goto done; + } + } + req->r_target_inode = in; + + err = fill_inode(in, + &rinfo->targeti, NULL, + session, req->r_request_started, + (le32_to_cpu(rinfo->head->result) == 0) ? + req->r_fmode : -1, + &req->r_caps_reservation); + if (err < 0) { + pr_err("fill_inode badness %p %llx.%llx\n", + in, ceph_vinop(in)); + goto done; + } + } + +done: + dout("fill_trace done err=%d\n", err); + return err; +} + +/* + * Prepopulate our cache with readdir results, leases, etc. + */ +int ceph_readdir_prepopulate(struct ceph_mds_request *req, + struct ceph_mds_session *session) +{ + struct dentry *parent = req->r_dentry; + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + struct qstr dname; + struct dentry *dn; + struct inode *in; + int err = 0, i; + struct inode *snapdir = NULL; + struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; + u64 frag = le32_to_cpu(rhead->args.readdir.frag); + struct ceph_dentry_info *di; + + if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { + snapdir = ceph_get_snapdir(parent->d_inode); + parent = d_find_alias(snapdir); + dout("readdir_prepopulate %d items under SNAPDIR dn %p\n", + rinfo->dir_nr, parent); + } else { + dout("readdir_prepopulate %d items under dn %p\n", + rinfo->dir_nr, parent); + if (rinfo->dir_dir) + ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir); + } + + for (i = 0; i < rinfo->dir_nr; i++) { + struct ceph_vino vino; + + dname.name = rinfo->dir_dname[i]; + dname.len = rinfo->dir_dname_len[i]; + dname.hash = full_name_hash(dname.name, dname.len); + + vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); + vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); + +retry_lookup: + dn = d_lookup(parent, &dname); + dout("d_lookup on parent=%p name=%.*s got %p\n", + parent, dname.len, dname.name, dn); + + if (!dn) { + dn = d_alloc(parent, &dname); + dout("d_alloc %p '%.*s' = %p\n", parent, + dname.len, dname.name, dn); + if (dn == NULL) { + dout("d_alloc badness\n"); + err = -ENOMEM; + goto out; + } + err = ceph_init_dentry(dn); + if (err < 0) + goto out; + } else if (dn->d_inode && + (ceph_ino(dn->d_inode) != vino.ino || + ceph_snap(dn->d_inode) != vino.snap)) { + dout(" dn %p points to wrong inode %p\n", + dn, dn->d_inode); + d_delete(dn); + dput(dn); + goto retry_lookup; + } else { + /* reorder parent's d_subdirs */ + spin_lock(&dcache_lock); + spin_lock(&dn->d_lock); + list_move(&dn->d_u.d_child, &parent->d_subdirs); + spin_unlock(&dn->d_lock); + spin_unlock(&dcache_lock); + } + + di = dn->d_fsdata; + di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); + + /* inode */ + if (dn->d_inode) { + in = dn->d_inode; + } else { + in = ceph_get_inode(parent->d_sb, vino); + if (in == NULL) { + dout("new_inode badness\n"); + d_delete(dn); + dput(dn); + err = -ENOMEM; + goto out; + } + dn = splice_dentry(dn, in, NULL); + } + + if (fill_inode(in, &rinfo->dir_in[i], NULL, session, + req->r_request_started, -1, + &req->r_caps_reservation) < 0) { + pr_err("fill_inode badness on %p\n", in); + dput(dn); + continue; + } + update_dentry_lease(dn, rinfo->dir_dlease[i], + req->r_session, req->r_request_started); + dput(dn); + } + req->r_did_prepopulate = true; + +out: + if (snapdir) { + iput(snapdir); + dput(parent); + } + dout("readdir_prepopulate done\n"); + return err; +} + +int ceph_inode_set_size(struct inode *inode, loff_t size) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret = 0; + + spin_lock(&inode->i_lock); + dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); + inode->i_size = size; + inode->i_blocks = (size + (1 << 9) - 1) >> 9; + + /* tell the MDS if we are approaching max_size */ + if ((size << 1) >= ci->i_max_size && + (ci->i_reported_size << 1) < ci->i_max_size) + ret = 1; + + spin_unlock(&inode->i_lock); + return ret; +} + +/* + * Write back inode data in a worker thread. (This can't be done + * in the message handler context.) + */ +void ceph_queue_writeback(struct inode *inode) +{ + if (queue_work(ceph_inode_to_client(inode)->wb_wq, + &ceph_inode(inode)->i_wb_work)) { + dout("ceph_queue_writeback %p\n", inode); + igrab(inode); + } else { + dout("ceph_queue_writeback %p failed\n", inode); + } +} + +static void ceph_writeback_work(struct work_struct *work) +{ + struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, + i_wb_work); + struct inode *inode = &ci->vfs_inode; + + dout("writeback %p\n", inode); + filemap_fdatawrite(&inode->i_data); + iput(inode); +} + +/* + * queue an async invalidation + */ +void ceph_queue_invalidate(struct inode *inode) +{ + if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq, + &ceph_inode(inode)->i_pg_inv_work)) { + dout("ceph_queue_invalidate %p\n", inode); + igrab(inode); + } else { + dout("ceph_queue_invalidate %p failed\n", inode); + } +} + +/* + * invalidate any pages that are not dirty or under writeback. this + * includes pages that are clean and mapped. + */ +static void ceph_invalidate_nondirty_pages(struct address_space *mapping) +{ + struct pagevec pvec; + pgoff_t next = 0; + int i; + + pagevec_init(&pvec, 0); + while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + pgoff_t index; + int skip_page = + (PageDirty(page) || PageWriteback(page)); + + if (!skip_page) + skip_page = !trylock_page(page); + + /* + * We really shouldn't be looking at the ->index of an + * unlocked page. But we're not allowed to lock these + * pages. So we rely upon nobody altering the ->index + * of this (pinned-by-us) page. + */ + index = page->index; + if (index > next) + next = index; + next++; + + if (skip_page) + continue; + + generic_error_remove_page(mapping, page); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } +} + +/* + * Invalidate inode pages in a worker thread. (This can't be done + * in the message handler context.) + */ +static void ceph_invalidate_work(struct work_struct *work) +{ + struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, + i_pg_inv_work); + struct inode *inode = &ci->vfs_inode; + u32 orig_gen; + int check = 0; + + spin_lock(&inode->i_lock); + dout("invalidate_pages %p gen %d revoking %d\n", inode, + ci->i_rdcache_gen, ci->i_rdcache_revoking); + if (ci->i_rdcache_gen == 0 || + ci->i_rdcache_revoking != ci->i_rdcache_gen) { + BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen); + /* nevermind! */ + ci->i_rdcache_revoking = 0; + spin_unlock(&inode->i_lock); + goto out; + } + orig_gen = ci->i_rdcache_gen; + spin_unlock(&inode->i_lock); + + ceph_invalidate_nondirty_pages(inode->i_mapping); + + spin_lock(&inode->i_lock); + if (orig_gen == ci->i_rdcache_gen) { + dout("invalidate_pages %p gen %d successful\n", inode, + ci->i_rdcache_gen); + ci->i_rdcache_gen = 0; + ci->i_rdcache_revoking = 0; + check = 1; + } else { + dout("invalidate_pages %p gen %d raced, gen now %d\n", + inode, orig_gen, ci->i_rdcache_gen); + } + spin_unlock(&inode->i_lock); + + if (check) + ceph_check_caps(ci, 0, NULL); +out: + iput(inode); +} + + +/* + * called by trunc_wq; take i_mutex ourselves + * + * We also truncate in a separate thread as well. + */ +static void ceph_vmtruncate_work(struct work_struct *work) +{ + struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, + i_vmtruncate_work); + struct inode *inode = &ci->vfs_inode; + + dout("vmtruncate_work %p\n", inode); + mutex_lock(&inode->i_mutex); + __ceph_do_pending_vmtruncate(inode); + mutex_unlock(&inode->i_mutex); + iput(inode); +} + +/* + * Queue an async vmtruncate. If we fail to queue work, we will handle + * the truncation the next time we call __ceph_do_pending_vmtruncate. + */ +void ceph_queue_vmtruncate(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + if (queue_work(ceph_client(inode->i_sb)->trunc_wq, + &ci->i_vmtruncate_work)) { + dout("ceph_queue_vmtruncate %p\n", inode); + igrab(inode); + } else { + dout("ceph_queue_vmtruncate %p failed, pending=%d\n", + inode, ci->i_truncate_pending); + } +} + +/* + * called with i_mutex held. + * + * Make sure any pending truncation is applied before doing anything + * that may depend on it. + */ +void __ceph_do_pending_vmtruncate(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + u64 to; + int wrbuffer_refs, wake = 0; + +retry: + spin_lock(&inode->i_lock); + if (ci->i_truncate_pending == 0) { + dout("__do_pending_vmtruncate %p none pending\n", inode); + spin_unlock(&inode->i_lock); + return; + } + + /* + * make sure any dirty snapped pages are flushed before we + * possibly truncate them.. so write AND block! + */ + if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { + dout("__do_pending_vmtruncate %p flushing snaps first\n", + inode); + spin_unlock(&inode->i_lock); + filemap_write_and_wait_range(&inode->i_data, 0, + inode->i_sb->s_maxbytes); + goto retry; + } + + to = ci->i_truncate_size; + wrbuffer_refs = ci->i_wrbuffer_ref; + dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, + ci->i_truncate_pending, to); + spin_unlock(&inode->i_lock); + + truncate_inode_pages(inode->i_mapping, to); + + spin_lock(&inode->i_lock); + ci->i_truncate_pending--; + if (ci->i_truncate_pending == 0) + wake = 1; + spin_unlock(&inode->i_lock); + + if (wrbuffer_refs == 0) + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + if (wake) + wake_up(&ci->i_cap_wq); +} + + +/* + * symlinks + */ +static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ceph_inode_info *ci = ceph_inode(dentry->d_inode); + nd_set_link(nd, ci->i_symlink); + return NULL; +} + +static const struct inode_operations ceph_symlink_iops = { + .readlink = generic_readlink, + .follow_link = ceph_sym_follow_link, +}; + +/* + * setattr + */ +int ceph_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct inode *parent_inode = dentry->d_parent->d_inode; + const unsigned int ia_valid = attr->ia_valid; + struct ceph_mds_request *req; + struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc; + int issued; + int release = 0, dirtied = 0; + int mask = 0; + int err = 0; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EROFS; + + __ceph_do_pending_vmtruncate(inode); + + err = inode_change_ok(inode, attr); + if (err != 0) + return err; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR, + USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + + spin_lock(&inode->i_lock); + issued = __ceph_caps_issued(ci, NULL); + dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); + + if (ia_valid & ATTR_UID) { + dout("setattr %p uid %d -> %d\n", inode, + inode->i_uid, attr->ia_uid); + if (issued & CEPH_CAP_AUTH_EXCL) { + inode->i_uid = attr->ia_uid; + dirtied |= CEPH_CAP_AUTH_EXCL; + } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || + attr->ia_uid != inode->i_uid) { + req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid); + mask |= CEPH_SETATTR_UID; + release |= CEPH_CAP_AUTH_SHARED; + } + } + if (ia_valid & ATTR_GID) { + dout("setattr %p gid %d -> %d\n", inode, + inode->i_gid, attr->ia_gid); + if (issued & CEPH_CAP_AUTH_EXCL) { + inode->i_gid = attr->ia_gid; + dirtied |= CEPH_CAP_AUTH_EXCL; + } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || + attr->ia_gid != inode->i_gid) { + req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid); + mask |= CEPH_SETATTR_GID; + release |= CEPH_CAP_AUTH_SHARED; + } + } + if (ia_valid & ATTR_MODE) { + dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode, + attr->ia_mode); + if (issued & CEPH_CAP_AUTH_EXCL) { + inode->i_mode = attr->ia_mode; + dirtied |= CEPH_CAP_AUTH_EXCL; + } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || + attr->ia_mode != inode->i_mode) { + req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode); + mask |= CEPH_SETATTR_MODE; + release |= CEPH_CAP_AUTH_SHARED; + } + } + + if (ia_valid & ATTR_ATIME) { + dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode, + inode->i_atime.tv_sec, inode->i_atime.tv_nsec, + attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec); + if (issued & CEPH_CAP_FILE_EXCL) { + ci->i_time_warp_seq++; + inode->i_atime = attr->ia_atime; + dirtied |= CEPH_CAP_FILE_EXCL; + } else if ((issued & CEPH_CAP_FILE_WR) && + timespec_compare(&inode->i_atime, + &attr->ia_atime) < 0) { + inode->i_atime = attr->ia_atime; + dirtied |= CEPH_CAP_FILE_WR; + } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || + !timespec_equal(&inode->i_atime, &attr->ia_atime)) { + ceph_encode_timespec(&req->r_args.setattr.atime, + &attr->ia_atime); + mask |= CEPH_SETATTR_ATIME; + release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD | + CEPH_CAP_FILE_WR; + } + } + if (ia_valid & ATTR_MTIME) { + dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode, + inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, + attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec); + if (issued & CEPH_CAP_FILE_EXCL) { + ci->i_time_warp_seq++; + inode->i_mtime = attr->ia_mtime; + dirtied |= CEPH_CAP_FILE_EXCL; + } else if ((issued & CEPH_CAP_FILE_WR) && + timespec_compare(&inode->i_mtime, + &attr->ia_mtime) < 0) { + inode->i_mtime = attr->ia_mtime; + dirtied |= CEPH_CAP_FILE_WR; + } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || + !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) { + ceph_encode_timespec(&req->r_args.setattr.mtime, + &attr->ia_mtime); + mask |= CEPH_SETATTR_MTIME; + release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | + CEPH_CAP_FILE_WR; + } + } + if (ia_valid & ATTR_SIZE) { + dout("setattr %p size %lld -> %lld\n", inode, + inode->i_size, attr->ia_size); + if (attr->ia_size > inode->i_sb->s_maxbytes) { + err = -EINVAL; + goto out; + } + if ((issued & CEPH_CAP_FILE_EXCL) && + attr->ia_size > inode->i_size) { + inode->i_size = attr->ia_size; + inode->i_blocks = + (attr->ia_size + (1 << 9) - 1) >> 9; + inode->i_ctime = attr->ia_ctime; + ci->i_reported_size = attr->ia_size; + dirtied |= CEPH_CAP_FILE_EXCL; + } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || + attr->ia_size != inode->i_size) { + req->r_args.setattr.size = cpu_to_le64(attr->ia_size); + req->r_args.setattr.old_size = + cpu_to_le64(inode->i_size); + mask |= CEPH_SETATTR_SIZE; + release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | + CEPH_CAP_FILE_WR; + } + } + + /* these do nothing */ + if (ia_valid & ATTR_CTIME) { + bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME| + ATTR_MODE|ATTR_UID|ATTR_GID)) == 0; + dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode, + inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, + attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec, + only ? "ctime only" : "ignored"); + inode->i_ctime = attr->ia_ctime; + if (only) { + /* + * if kernel wants to dirty ctime but nothing else, + * we need to choose a cap to dirty under, or do + * a almost-no-op setattr + */ + if (issued & CEPH_CAP_AUTH_EXCL) + dirtied |= CEPH_CAP_AUTH_EXCL; + else if (issued & CEPH_CAP_FILE_EXCL) + dirtied |= CEPH_CAP_FILE_EXCL; + else if (issued & CEPH_CAP_XATTR_EXCL) + dirtied |= CEPH_CAP_XATTR_EXCL; + else + mask |= CEPH_SETATTR_CTIME; + } + } + if (ia_valid & ATTR_FILE) + dout("setattr %p ATTR_FILE ... hrm!\n", inode); + + if (dirtied) { + __ceph_mark_dirty_caps(ci, dirtied); + inode->i_ctime = CURRENT_TIME; + } + + release &= issued; + spin_unlock(&inode->i_lock); + + if (mask) { + req->r_inode = igrab(inode); + req->r_inode_drop = release; + req->r_args.setattr.mask = cpu_to_le32(mask); + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, parent_inode, req); + } + dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, + ceph_cap_string(dirtied), mask); + + ceph_mdsc_put_request(req); + __ceph_do_pending_vmtruncate(inode); + return err; +out: + spin_unlock(&inode->i_lock); + ceph_mdsc_put_request(req); + return err; +} + +/* + * Verify that we have a lease on the given mask. If not, + * do a getattr against an mds. + */ +int ceph_do_getattr(struct inode *inode, int mask) +{ + struct ceph_client *client = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_mds_request *req; + int err; + + if (ceph_snap(inode) == CEPH_SNAPDIR) { + dout("do_getattr inode %p SNAPDIR\n", inode); + return 0; + } + + dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask)); + if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) + return 0; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = igrab(inode); + req->r_num_caps = 1; + req->r_args.getattr.mask = cpu_to_le32(mask); + err = ceph_mdsc_do_request(mdsc, NULL, req); + ceph_mdsc_put_request(req); + dout("do_getattr result=%d\n", err); + return err; +} + + +/* + * Check inode permissions. We verify we have a valid value for + * the AUTH cap, then call the generic handler. + */ +int ceph_permission(struct inode *inode, int mask) +{ + int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED); + + if (!err) + err = generic_permission(inode, mask, NULL); + return err; +} + +/* + * Get all attributes. Hopefully somedata we'll have a statlite() + * and can limit the fields we require to be accurate. + */ +int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + int err; + + err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL); + if (!err) { + generic_fillattr(inode, stat); + stat->ino = inode->i_ino; + if (ceph_snap(inode) != CEPH_NOSNAP) + stat->dev = ceph_snap(inode); + else + stat->dev = 0; + if (S_ISDIR(inode->i_mode)) { + stat->size = ci->i_rbytes; + stat->blocks = 0; + stat->blksize = 65536; + } + } + return err; +} diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c new file mode 100644 index 00000000000..8a5bcae6284 --- /dev/null +++ b/fs/ceph/ioctl.c @@ -0,0 +1,160 @@ +#include <linux/in.h> + +#include "ioctl.h" +#include "super.h" +#include "ceph_debug.h" + + +/* + * ioctls + */ + +/* + * get and set the file layout + */ +static long ceph_ioctl_get_layout(struct file *file, void __user *arg) +{ + struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode); + struct ceph_ioctl_layout l; + int err; + + err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT); + if (!err) { + l.stripe_unit = ceph_file_layout_su(ci->i_layout); + l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); + l.object_size = ceph_file_layout_object_size(ci->i_layout); + l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool); + l.preferred_osd = + (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred); + if (copy_to_user(arg, &l, sizeof(l))) + return -EFAULT; + } + + return err; +} + +static long ceph_ioctl_set_layout(struct file *file, void __user *arg) +{ + struct inode *inode = file->f_dentry->d_inode; + struct inode *parent_inode = file->f_dentry->d_parent->d_inode; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_request *req; + struct ceph_ioctl_layout l; + int err, i; + + /* copy and validate */ + if (copy_from_user(&l, arg, sizeof(l))) + return -EFAULT; + + if ((l.object_size & ~PAGE_MASK) || + (l.stripe_unit & ~PAGE_MASK) || + !l.stripe_unit || + (l.object_size && + (unsigned)l.object_size % (unsigned)l.stripe_unit)) + return -EINVAL; + + /* make sure it's a valid data pool */ + if (l.data_pool > 0) { + mutex_lock(&mdsc->mutex); + err = -EINVAL; + for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++) + if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) { + err = 0; + break; + } + mutex_unlock(&mdsc->mutex); + if (err) + return err; + } + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT, + USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = igrab(inode); + req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL; + + req->r_args.setlayout.layout.fl_stripe_unit = + cpu_to_le32(l.stripe_unit); + req->r_args.setlayout.layout.fl_stripe_count = + cpu_to_le32(l.stripe_count); + req->r_args.setlayout.layout.fl_object_size = + cpu_to_le32(l.object_size); + req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool); + req->r_args.setlayout.layout.fl_pg_preferred = + cpu_to_le32(l.preferred_osd); + + err = ceph_mdsc_do_request(mdsc, parent_inode, req); + ceph_mdsc_put_request(req); + return err; +} + +/* + * Return object name, size/offset information, and location (OSD + * number, network address) for a given file offset. + */ +static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) +{ + struct ceph_ioctl_dataloc dl; + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; + u64 len = 1, olen; + u64 tmp; + struct ceph_object_layout ol; + struct ceph_pg pgid; + + /* copy and validate */ + if (copy_from_user(&dl, arg, sizeof(dl))) + return -EFAULT; + + down_read(&osdc->map_sem); + ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, + &dl.object_no, &dl.object_offset, &olen); + dl.file_offset -= dl.object_offset; + dl.object_size = ceph_file_layout_object_size(ci->i_layout); + dl.block_size = ceph_file_layout_su(ci->i_layout); + + /* block_offset = object_offset % block_size */ + tmp = dl.object_offset; + dl.block_offset = do_div(tmp, dl.block_size); + + snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", + ceph_ino(inode), dl.object_no); + ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout, + osdc->osdmap); + + pgid = ol.ol_pgid; + dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); + if (dl.osd >= 0) { + struct ceph_entity_addr *a = + ceph_osd_addr(osdc->osdmap, dl.osd); + if (a) + memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr)); + } else { + memset(&dl.osd_addr, 0, sizeof(dl.osd_addr)); + } + up_read(&osdc->map_sem); + + /* send result back to user */ + if (copy_to_user(arg, &dl, sizeof(dl))) + return -EFAULT; + + return 0; +} + +long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg); + switch (cmd) { + case CEPH_IOC_GET_LAYOUT: + return ceph_ioctl_get_layout(file, (void __user *)arg); + + case CEPH_IOC_SET_LAYOUT: + return ceph_ioctl_set_layout(file, (void __user *)arg); + + case CEPH_IOC_GET_DATALOC: + return ceph_ioctl_get_dataloc(file, (void __user *)arg); + } + return -ENOTTY; +} diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h new file mode 100644 index 00000000000..25e4f1a9d05 --- /dev/null +++ b/fs/ceph/ioctl.h @@ -0,0 +1,40 @@ +#ifndef FS_CEPH_IOCTL_H +#define FS_CEPH_IOCTL_H + +#include <linux/ioctl.h> +#include <linux/types.h> + +#define CEPH_IOCTL_MAGIC 0x97 + +/* just use u64 to align sanely on all archs */ +struct ceph_ioctl_layout { + __u64 stripe_unit, stripe_count, object_size; + __u64 data_pool; + __s64 preferred_osd; +}; + +#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \ + struct ceph_ioctl_layout) +#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \ + struct ceph_ioctl_layout) + +/* + * Extract identity, address of the OSD and object storing a given + * file offset. + */ +struct ceph_ioctl_dataloc { + __u64 file_offset; /* in+out: file offset */ + __u64 object_offset; /* out: offset in object */ + __u64 object_no; /* out: object # */ + __u64 object_size; /* out: object size */ + char object_name[64]; /* out: object name */ + __u64 block_offset; /* out: offset in block */ + __u64 block_size; /* out: block length */ + __s64 osd; /* out: osd # */ + struct sockaddr_storage osd_addr; /* out: osd address */ +}; + +#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \ + struct ceph_ioctl_dataloc) + +#endif diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c new file mode 100644 index 00000000000..a2600101ec2 --- /dev/null +++ b/fs/ceph/mds_client.c @@ -0,0 +1,3021 @@ +#include "ceph_debug.h" + +#include <linux/wait.h> +#include <linux/sched.h> + +#include "mds_client.h" +#include "mon_client.h" +#include "super.h" +#include "messenger.h" +#include "decode.h" +#include "auth.h" +#include "pagelist.h" + +/* + * A cluster of MDS (metadata server) daemons is responsible for + * managing the file system namespace (the directory hierarchy and + * inodes) and for coordinating shared access to storage. Metadata is + * partitioning hierarchically across a number of servers, and that + * partition varies over time as the cluster adjusts the distribution + * in order to balance load. + * + * The MDS client is primarily responsible to managing synchronous + * metadata requests for operations like open, unlink, and so forth. + * If there is a MDS failure, we find out about it when we (possibly + * request and) receive a new MDS map, and can resubmit affected + * requests. + * + * For the most part, though, we take advantage of a lossless + * communications channel to the MDS, and do not need to worry about + * timing out or resubmitting requests. + * + * We maintain a stateful "session" with each MDS we interact with. + * Within each session, we sent periodic heartbeat messages to ensure + * any capabilities or leases we have been issues remain valid. If + * the session times out and goes stale, our leases and capabilities + * are no longer valid. + */ + +static void __wake_requests(struct ceph_mds_client *mdsc, + struct list_head *head); + +const static struct ceph_connection_operations mds_con_ops; + + +/* + * mds reply parsing + */ + +/* + * parse individual inode info + */ +static int parse_reply_info_in(void **p, void *end, + struct ceph_mds_reply_info_in *info) +{ + int err = -EIO; + + info->in = *p; + *p += sizeof(struct ceph_mds_reply_inode) + + sizeof(*info->in->fragtree.splits) * + le32_to_cpu(info->in->fragtree.nsplits); + + ceph_decode_32_safe(p, end, info->symlink_len, bad); + ceph_decode_need(p, end, info->symlink_len, bad); + info->symlink = *p; + *p += info->symlink_len; + + ceph_decode_32_safe(p, end, info->xattr_len, bad); + ceph_decode_need(p, end, info->xattr_len, bad); + info->xattr_data = *p; + *p += info->xattr_len; + return 0; +bad: + return err; +} + +/* + * parse a normal reply, which may contain a (dir+)dentry and/or a + * target inode. + */ +static int parse_reply_info_trace(void **p, void *end, + struct ceph_mds_reply_info_parsed *info) +{ + int err; + + if (info->head->is_dentry) { + err = parse_reply_info_in(p, end, &info->diri); + if (err < 0) + goto out_bad; + + if (unlikely(*p + sizeof(*info->dirfrag) > end)) + goto bad; + info->dirfrag = *p; + *p += sizeof(*info->dirfrag) + + sizeof(u32)*le32_to_cpu(info->dirfrag->ndist); + if (unlikely(*p > end)) + goto bad; + + ceph_decode_32_safe(p, end, info->dname_len, bad); + ceph_decode_need(p, end, info->dname_len, bad); + info->dname = *p; + *p += info->dname_len; + info->dlease = *p; + *p += sizeof(*info->dlease); + } + + if (info->head->is_target) { + err = parse_reply_info_in(p, end, &info->targeti); + if (err < 0) + goto out_bad; + } + + if (unlikely(*p != end)) + goto bad; + return 0; + +bad: + err = -EIO; +out_bad: + pr_err("problem parsing mds trace %d\n", err); + return err; +} + +/* + * parse readdir results + */ +static int parse_reply_info_dir(void **p, void *end, + struct ceph_mds_reply_info_parsed *info) +{ + u32 num, i = 0; + int err; + + info->dir_dir = *p; + if (*p + sizeof(*info->dir_dir) > end) + goto bad; + *p += sizeof(*info->dir_dir) + + sizeof(u32)*le32_to_cpu(info->dir_dir->ndist); + if (*p > end) + goto bad; + + ceph_decode_need(p, end, sizeof(num) + 2, bad); + num = ceph_decode_32(p); + info->dir_end = ceph_decode_8(p); + info->dir_complete = ceph_decode_8(p); + if (num == 0) + goto done; + + /* alloc large array */ + info->dir_nr = num; + info->dir_in = kcalloc(num, sizeof(*info->dir_in) + + sizeof(*info->dir_dname) + + sizeof(*info->dir_dname_len) + + sizeof(*info->dir_dlease), + GFP_NOFS); + if (info->dir_in == NULL) { + err = -ENOMEM; + goto out_bad; + } + info->dir_dname = (void *)(info->dir_in + num); + info->dir_dname_len = (void *)(info->dir_dname + num); + info->dir_dlease = (void *)(info->dir_dname_len + num); + + while (num) { + /* dentry */ + ceph_decode_need(p, end, sizeof(u32)*2, bad); + info->dir_dname_len[i] = ceph_decode_32(p); + ceph_decode_need(p, end, info->dir_dname_len[i], bad); + info->dir_dname[i] = *p; + *p += info->dir_dname_len[i]; + dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i], + info->dir_dname[i]); + info->dir_dlease[i] = *p; + *p += sizeof(struct ceph_mds_reply_lease); + + /* inode */ + err = parse_reply_info_in(p, end, &info->dir_in[i]); + if (err < 0) + goto out_bad; + i++; + num--; + } + +done: + if (*p != end) + goto bad; + return 0; + +bad: + err = -EIO; +out_bad: + pr_err("problem parsing dir contents %d\n", err); + return err; +} + +/* + * parse entire mds reply + */ +static int parse_reply_info(struct ceph_msg *msg, + struct ceph_mds_reply_info_parsed *info) +{ + void *p, *end; + u32 len; + int err; + + info->head = msg->front.iov_base; + p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head); + end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head); + + /* trace */ + ceph_decode_32_safe(&p, end, len, bad); + if (len > 0) { + err = parse_reply_info_trace(&p, p+len, info); + if (err < 0) + goto out_bad; + } + + /* dir content */ + ceph_decode_32_safe(&p, end, len, bad); + if (len > 0) { + err = parse_reply_info_dir(&p, p+len, info); + if (err < 0) + goto out_bad; + } + + /* snap blob */ + ceph_decode_32_safe(&p, end, len, bad); + info->snapblob_len = len; + info->snapblob = p; + p += len; + + if (p != end) + goto bad; + return 0; + +bad: + err = -EIO; +out_bad: + pr_err("mds parse_reply err %d\n", err); + return err; +} + +static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) +{ + kfree(info->dir_in); +} + + +/* + * sessions + */ +static const char *session_state_name(int s) +{ + switch (s) { + case CEPH_MDS_SESSION_NEW: return "new"; + case CEPH_MDS_SESSION_OPENING: return "opening"; + case CEPH_MDS_SESSION_OPEN: return "open"; + case CEPH_MDS_SESSION_HUNG: return "hung"; + case CEPH_MDS_SESSION_CLOSING: return "closing"; + case CEPH_MDS_SESSION_RESTARTING: return "restarting"; + case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting"; + default: return "???"; + } +} + +static struct ceph_mds_session *get_session(struct ceph_mds_session *s) +{ + if (atomic_inc_not_zero(&s->s_ref)) { + dout("mdsc get_session %p %d -> %d\n", s, + atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref)); + return s; + } else { + dout("mdsc get_session %p 0 -- FAIL", s); + return NULL; + } +} + +void ceph_put_mds_session(struct ceph_mds_session *s) +{ + dout("mdsc put_session %p %d -> %d\n", s, + atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); + if (atomic_dec_and_test(&s->s_ref)) { + if (s->s_authorizer) + s->s_mdsc->client->monc.auth->ops->destroy_authorizer( + s->s_mdsc->client->monc.auth, s->s_authorizer); + kfree(s); + } +} + +/* + * called under mdsc->mutex + */ +struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, + int mds) +{ + struct ceph_mds_session *session; + + if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL) + return NULL; + session = mdsc->sessions[mds]; + dout("lookup_mds_session %p %d\n", session, + atomic_read(&session->s_ref)); + get_session(session); + return session; +} + +static bool __have_session(struct ceph_mds_client *mdsc, int mds) +{ + if (mds >= mdsc->max_sessions) + return false; + return mdsc->sessions[mds]; +} + +static int __verify_registered_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *s) +{ + if (s->s_mds >= mdsc->max_sessions || + mdsc->sessions[s->s_mds] != s) + return -ENOENT; + return 0; +} + +/* + * create+register a new session for given mds. + * called under mdsc->mutex. + */ +static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, + int mds) +{ + struct ceph_mds_session *s; + + s = kzalloc(sizeof(*s), GFP_NOFS); + s->s_mdsc = mdsc; + s->s_mds = mds; + s->s_state = CEPH_MDS_SESSION_NEW; + s->s_ttl = 0; + s->s_seq = 0; + mutex_init(&s->s_mutex); + + ceph_con_init(mdsc->client->msgr, &s->s_con); + s->s_con.private = s; + s->s_con.ops = &mds_con_ops; + s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; + s->s_con.peer_name.num = cpu_to_le64(mds); + + spin_lock_init(&s->s_cap_lock); + s->s_cap_gen = 0; + s->s_cap_ttl = 0; + s->s_renew_requested = 0; + s->s_renew_seq = 0; + INIT_LIST_HEAD(&s->s_caps); + s->s_nr_caps = 0; + s->s_trim_caps = 0; + atomic_set(&s->s_ref, 1); + INIT_LIST_HEAD(&s->s_waiting); + INIT_LIST_HEAD(&s->s_unsafe); + s->s_num_cap_releases = 0; + s->s_cap_iterator = NULL; + INIT_LIST_HEAD(&s->s_cap_releases); + INIT_LIST_HEAD(&s->s_cap_releases_done); + INIT_LIST_HEAD(&s->s_cap_flushing); + INIT_LIST_HEAD(&s->s_cap_snaps_flushing); + + dout("register_session mds%d\n", mds); + if (mds >= mdsc->max_sessions) { + int newmax = 1 << get_count_order(mds+1); + struct ceph_mds_session **sa; + + dout("register_session realloc to %d\n", newmax); + sa = kcalloc(newmax, sizeof(void *), GFP_NOFS); + if (sa == NULL) + goto fail_realloc; + if (mdsc->sessions) { + memcpy(sa, mdsc->sessions, + mdsc->max_sessions * sizeof(void *)); + kfree(mdsc->sessions); + } + mdsc->sessions = sa; + mdsc->max_sessions = newmax; + } + mdsc->sessions[mds] = s; + atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ + + ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); + + return s; + +fail_realloc: + kfree(s); + return ERR_PTR(-ENOMEM); +} + +/* + * called under mdsc->mutex + */ +static void __unregister_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *s) +{ + dout("__unregister_session mds%d %p\n", s->s_mds, s); + BUG_ON(mdsc->sessions[s->s_mds] != s); + mdsc->sessions[s->s_mds] = NULL; + ceph_con_close(&s->s_con); + ceph_put_mds_session(s); +} + +/* + * drop session refs in request. + * + * should be last request ref, or hold mdsc->mutex + */ +static void put_request_session(struct ceph_mds_request *req) +{ + if (req->r_session) { + ceph_put_mds_session(req->r_session); + req->r_session = NULL; + } +} + +void ceph_mdsc_release_request(struct kref *kref) +{ + struct ceph_mds_request *req = container_of(kref, + struct ceph_mds_request, + r_kref); + if (req->r_request) + ceph_msg_put(req->r_request); + if (req->r_reply) { + ceph_msg_put(req->r_reply); + destroy_reply_info(&req->r_reply_info); + } + if (req->r_inode) { + ceph_put_cap_refs(ceph_inode(req->r_inode), + CEPH_CAP_PIN); + iput(req->r_inode); + } + if (req->r_locked_dir) + ceph_put_cap_refs(ceph_inode(req->r_locked_dir), + CEPH_CAP_PIN); + if (req->r_target_inode) + iput(req->r_target_inode); + if (req->r_dentry) + dput(req->r_dentry); + if (req->r_old_dentry) { + ceph_put_cap_refs( + ceph_inode(req->r_old_dentry->d_parent->d_inode), + CEPH_CAP_PIN); + dput(req->r_old_dentry); + } + kfree(req->r_path1); + kfree(req->r_path2); + put_request_session(req); + ceph_unreserve_caps(&req->r_caps_reservation); + kfree(req); +} + +/* + * lookup session, bump ref if found. + * + * called under mdsc->mutex. + */ +static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, + u64 tid) +{ + struct ceph_mds_request *req; + struct rb_node *n = mdsc->request_tree.rb_node; + + while (n) { + req = rb_entry(n, struct ceph_mds_request, r_node); + if (tid < req->r_tid) + n = n->rb_left; + else if (tid > req->r_tid) + n = n->rb_right; + else { + ceph_mdsc_get_request(req); + return req; + } + } + return NULL; +} + +static void __insert_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *new) +{ + struct rb_node **p = &mdsc->request_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_mds_request *req = NULL; + + while (*p) { + parent = *p; + req = rb_entry(parent, struct ceph_mds_request, r_node); + if (new->r_tid < req->r_tid) + p = &(*p)->rb_left; + else if (new->r_tid > req->r_tid) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->r_node, parent, p); + rb_insert_color(&new->r_node, &mdsc->request_tree); +} + +/* + * Register an in-flight request, and assign a tid. Link to directory + * are modifying (if any). + * + * Called under mdsc->mutex. + */ +static void __register_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + struct inode *dir) +{ + req->r_tid = ++mdsc->last_tid; + if (req->r_num_caps) + ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps); + dout("__register_request %p tid %lld\n", req, req->r_tid); + ceph_mdsc_get_request(req); + __insert_request(mdsc, req); + + if (dir) { + struct ceph_inode_info *ci = ceph_inode(dir); + + spin_lock(&ci->i_unsafe_lock); + req->r_unsafe_dir = dir; + list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops); + spin_unlock(&ci->i_unsafe_lock); + } +} + +static void __unregister_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + dout("__unregister_request %p tid %lld\n", req, req->r_tid); + rb_erase(&req->r_node, &mdsc->request_tree); + ceph_mdsc_put_request(req); + + if (req->r_unsafe_dir) { + struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); + + spin_lock(&ci->i_unsafe_lock); + list_del_init(&req->r_unsafe_dir_item); + spin_unlock(&ci->i_unsafe_lock); + } +} + +/* + * Choose mds to send request to next. If there is a hint set in the + * request (e.g., due to a prior forward hint from the mds), use that. + * Otherwise, consult frag tree and/or caps to identify the + * appropriate mds. If all else fails, choose randomly. + * + * Called under mdsc->mutex. + */ +static int __choose_mds(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + struct inode *inode; + struct ceph_inode_info *ci; + struct ceph_cap *cap; + int mode = req->r_direct_mode; + int mds = -1; + u32 hash = req->r_direct_hash; + bool is_hash = req->r_direct_is_hash; + + /* + * is there a specific mds we should try? ignore hint if we have + * no session and the mds is not up (active or recovering). + */ + if (req->r_resend_mds >= 0 && + (__have_session(mdsc, req->r_resend_mds) || + ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) { + dout("choose_mds using resend_mds mds%d\n", + req->r_resend_mds); + return req->r_resend_mds; + } + + if (mode == USE_RANDOM_MDS) + goto random; + + inode = NULL; + if (req->r_inode) { + inode = req->r_inode; + } else if (req->r_dentry) { + if (req->r_dentry->d_inode) { + inode = req->r_dentry->d_inode; + } else { + inode = req->r_dentry->d_parent->d_inode; + hash = req->r_dentry->d_name.hash; + is_hash = true; + } + } + dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, + (int)hash, mode); + if (!inode) + goto random; + ci = ceph_inode(inode); + + if (is_hash && S_ISDIR(inode->i_mode)) { + struct ceph_inode_frag frag; + int found; + + ceph_choose_frag(ci, hash, &frag, &found); + if (found) { + if (mode == USE_ANY_MDS && frag.ndist > 0) { + u8 r; + + /* choose a random replica */ + get_random_bytes(&r, 1); + r %= frag.ndist; + mds = frag.dist[r]; + dout("choose_mds %p %llx.%llx " + "frag %u mds%d (%d/%d)\n", + inode, ceph_vinop(inode), + frag.frag, frag.mds, + (int)r, frag.ndist); + return mds; + } + + /* since this file/dir wasn't known to be + * replicated, then we want to look for the + * authoritative mds. */ + mode = USE_AUTH_MDS; + if (frag.mds >= 0) { + /* choose auth mds */ + mds = frag.mds; + dout("choose_mds %p %llx.%llx " + "frag %u mds%d (auth)\n", + inode, ceph_vinop(inode), frag.frag, mds); + return mds; + } + } + } + + spin_lock(&inode->i_lock); + cap = NULL; + if (mode == USE_AUTH_MDS) + cap = ci->i_auth_cap; + if (!cap && !RB_EMPTY_ROOT(&ci->i_caps)) + cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); + if (!cap) { + spin_unlock(&inode->i_lock); + goto random; + } + mds = cap->session->s_mds; + dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n", + inode, ceph_vinop(inode), mds, + cap == ci->i_auth_cap ? "auth " : "", cap); + spin_unlock(&inode->i_lock); + return mds; + +random: + mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap); + dout("choose_mds chose random mds%d\n", mds); + return mds; +} + + +/* + * session messages + */ +static struct ceph_msg *create_session_msg(u32 op, u64 seq) +{ + struct ceph_msg *msg; + struct ceph_mds_session_head *h; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL); + if (IS_ERR(msg)) { + pr_err("create_session_msg ENOMEM creating msg\n"); + return ERR_PTR(PTR_ERR(msg)); + } + h = msg->front.iov_base; + h->op = cpu_to_le32(op); + h->seq = cpu_to_le64(seq); + return msg; +} + +/* + * send session open request. + * + * called under mdsc->mutex + */ +static int __open_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + int mstate; + int mds = session->s_mds; + int err = 0; + + /* wait for mds to go active? */ + mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); + dout("open_session to mds%d (%s)\n", mds, + ceph_mds_state_name(mstate)); + session->s_state = CEPH_MDS_SESSION_OPENING; + session->s_renew_requested = jiffies; + + /* send connect message */ + msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq); + if (IS_ERR(msg)) { + err = PTR_ERR(msg); + goto out; + } + ceph_con_send(&session->s_con, msg); + +out: + return 0; +} + +/* + * session caps + */ + +/* + * Free preallocated cap messages assigned to this session + */ +static void cleanup_cap_releases(struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + + spin_lock(&session->s_cap_lock); + while (!list_empty(&session->s_cap_releases)) { + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, list_head); + list_del_init(&msg->list_head); + ceph_msg_put(msg); + } + while (!list_empty(&session->s_cap_releases_done)) { + msg = list_first_entry(&session->s_cap_releases_done, + struct ceph_msg, list_head); + list_del_init(&msg->list_head); + ceph_msg_put(msg); + } + spin_unlock(&session->s_cap_lock); +} + +/* + * Helper to safely iterate over all caps associated with a session. + * + * caller must hold session s_mutex + */ +static int iterate_session_caps(struct ceph_mds_session *session, + int (*cb)(struct inode *, struct ceph_cap *, + void *), void *arg) +{ + struct list_head *p; + struct ceph_cap *cap; + struct inode *inode, *last_inode = NULL; + struct ceph_cap *old_cap = NULL; + int ret; + + dout("iterate_session_caps %p mds%d\n", session, session->s_mds); + spin_lock(&session->s_cap_lock); + p = session->s_caps.next; + while (p != &session->s_caps) { + cap = list_entry(p, struct ceph_cap, session_caps); + inode = igrab(&cap->ci->vfs_inode); + if (!inode) { + p = p->next; + continue; + } + session->s_cap_iterator = cap; + spin_unlock(&session->s_cap_lock); + + if (last_inode) { + iput(last_inode); + last_inode = NULL; + } + if (old_cap) { + ceph_put_cap(old_cap); + old_cap = NULL; + } + + ret = cb(inode, cap, arg); + last_inode = inode; + + spin_lock(&session->s_cap_lock); + p = p->next; + if (cap->ci == NULL) { + dout("iterate_session_caps finishing cap %p removal\n", + cap); + BUG_ON(cap->session != session); + list_del_init(&cap->session_caps); + session->s_nr_caps--; + cap->session = NULL; + old_cap = cap; /* put_cap it w/o locks held */ + } + if (ret < 0) + goto out; + } + ret = 0; +out: + session->s_cap_iterator = NULL; + spin_unlock(&session->s_cap_lock); + + if (last_inode) + iput(last_inode); + if (old_cap) + ceph_put_cap(old_cap); + + return ret; +} + +static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, + void *arg) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + dout("removing cap %p, ci is %p, inode is %p\n", + cap, ci, &ci->vfs_inode); + ceph_remove_cap(cap); + return 0; +} + +/* + * caller must hold session s_mutex + */ +static void remove_session_caps(struct ceph_mds_session *session) +{ + dout("remove_session_caps on %p\n", session); + iterate_session_caps(session, remove_session_caps_cb, NULL); + BUG_ON(session->s_nr_caps > 0); + cleanup_cap_releases(session); +} + +/* + * wake up any threads waiting on this session's caps. if the cap is + * old (didn't get renewed on the client reconnect), remove it now. + * + * caller must hold s_mutex. + */ +static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, + void *arg) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + wake_up(&ci->i_cap_wq); + if (arg) { + spin_lock(&inode->i_lock); + ci->i_wanted_max_size = 0; + ci->i_requested_max_size = 0; + spin_unlock(&inode->i_lock); + } + return 0; +} + +static void wake_up_session_caps(struct ceph_mds_session *session, + int reconnect) +{ + dout("wake_up_session_caps %p mds%d\n", session, session->s_mds); + iterate_session_caps(session, wake_up_session_cb, + (void *)(unsigned long)reconnect); +} + +/* + * Send periodic message to MDS renewing all currently held caps. The + * ack will reset the expiration for all caps from this session. + * + * caller holds s_mutex + */ +static int send_renew_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + int state; + + if (time_after_eq(jiffies, session->s_cap_ttl) && + time_after_eq(session->s_cap_ttl, session->s_renew_requested)) + pr_info("mds%d caps stale\n", session->s_mds); + + /* do not try to renew caps until a recovering mds has reconnected + * with its clients. */ + state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds); + if (state < CEPH_MDS_STATE_RECONNECT) { + dout("send_renew_caps ignoring mds%d (%s)\n", + session->s_mds, ceph_mds_state_name(state)); + return 0; + } + + dout("send_renew_caps to mds%d (%s)\n", session->s_mds, + ceph_mds_state_name(state)); + session->s_renew_requested = jiffies; + msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, + ++session->s_renew_seq); + if (IS_ERR(msg)) + return PTR_ERR(msg); + ceph_con_send(&session->s_con, msg); + return 0; +} + +/* + * Note new cap ttl, and any transition from stale -> not stale (fresh?). + * + * Called under session->s_mutex + */ +static void renewed_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, int is_renew) +{ + int was_stale; + int wake = 0; + + spin_lock(&session->s_cap_lock); + was_stale = is_renew && (session->s_cap_ttl == 0 || + time_after_eq(jiffies, session->s_cap_ttl)); + + session->s_cap_ttl = session->s_renew_requested + + mdsc->mdsmap->m_session_timeout*HZ; + + if (was_stale) { + if (time_before(jiffies, session->s_cap_ttl)) { + pr_info("mds%d caps renewed\n", session->s_mds); + wake = 1; + } else { + pr_info("mds%d caps still stale\n", session->s_mds); + } + } + dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n", + session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh", + time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh"); + spin_unlock(&session->s_cap_lock); + + if (wake) + wake_up_session_caps(session, 0); +} + +/* + * send a session close request + */ +static int request_close_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + int err = 0; + + dout("request_close_session mds%d state %s seq %lld\n", + session->s_mds, session_state_name(session->s_state), + session->s_seq); + msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); + if (IS_ERR(msg)) + err = PTR_ERR(msg); + else + ceph_con_send(&session->s_con, msg); + return err; +} + +/* + * Called with s_mutex held. + */ +static int __close_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + if (session->s_state >= CEPH_MDS_SESSION_CLOSING) + return 0; + session->s_state = CEPH_MDS_SESSION_CLOSING; + return request_close_session(mdsc, session); +} + +/* + * Trim old(er) caps. + * + * Because we can't cache an inode without one or more caps, we do + * this indirectly: if a cap is unused, we prune its aliases, at which + * point the inode will hopefully get dropped to. + * + * Yes, this is a bit sloppy. Our only real goal here is to respond to + * memory pressure from the MDS, though, so it needn't be perfect. + */ +static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) +{ + struct ceph_mds_session *session = arg; + struct ceph_inode_info *ci = ceph_inode(inode); + int used, oissued, mine; + + if (session->s_trim_caps <= 0) + return -1; + + spin_lock(&inode->i_lock); + mine = cap->issued | cap->implemented; + used = __ceph_caps_used(ci); + oissued = __ceph_caps_issued_other(ci, cap); + + dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n", + inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), + ceph_cap_string(used)); + if (ci->i_dirty_caps) + goto out; /* dirty caps */ + if ((used & ~oissued) & mine) + goto out; /* we need these caps */ + + session->s_trim_caps--; + if (oissued) { + /* we aren't the only cap.. just remove us */ + __ceph_remove_cap(cap); + } else { + /* try to drop referring dentries */ + spin_unlock(&inode->i_lock); + d_prune_aliases(inode); + dout("trim_caps_cb %p cap %p pruned, count now %d\n", + inode, cap, atomic_read(&inode->i_count)); + return 0; + } + +out: + spin_unlock(&inode->i_lock); + return 0; +} + +/* + * Trim session cap count down to some max number. + */ +static int trim_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + int max_caps) +{ + int trim_caps = session->s_nr_caps - max_caps; + + dout("trim_caps mds%d start: %d / %d, trim %d\n", + session->s_mds, session->s_nr_caps, max_caps, trim_caps); + if (trim_caps > 0) { + session->s_trim_caps = trim_caps; + iterate_session_caps(session, trim_caps_cb, session); + dout("trim_caps mds%d done: %d / %d, trimmed %d\n", + session->s_mds, session->s_nr_caps, max_caps, + trim_caps - session->s_trim_caps); + session->s_trim_caps = 0; + } + return 0; +} + +/* + * Allocate cap_release messages. If there is a partially full message + * in the queue, try to allocate enough to cover it's remainder, so that + * we can send it immediately. + * + * Called under s_mutex. + */ +static int add_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + int extra) +{ + struct ceph_msg *msg; + struct ceph_mds_cap_release *head; + int err = -ENOMEM; + + if (extra < 0) + extra = mdsc->client->mount_args->cap_release_safety; + + spin_lock(&session->s_cap_lock); + + if (!list_empty(&session->s_cap_releases)) { + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, + list_head); + head = msg->front.iov_base; + extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num); + } + + while (session->s_num_cap_releases < session->s_nr_caps + extra) { + spin_unlock(&session->s_cap_lock); + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, + 0, 0, NULL); + if (!msg) + goto out_unlocked; + dout("add_cap_releases %p msg %p now %d\n", session, msg, + (int)msg->front.iov_len); + head = msg->front.iov_base; + head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); + spin_lock(&session->s_cap_lock); + list_add(&msg->list_head, &session->s_cap_releases); + session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE; + } + + if (!list_empty(&session->s_cap_releases)) { + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, + list_head); + head = msg->front.iov_base; + if (head->num) { + dout(" queueing non-full %p (%d)\n", msg, + le32_to_cpu(head->num)); + list_move_tail(&msg->list_head, + &session->s_cap_releases_done); + session->s_num_cap_releases -= + CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num); + } + } + err = 0; + spin_unlock(&session->s_cap_lock); +out_unlocked: + return err; +} + +/* + * flush all dirty inode data to disk. + * + * returns true if we've flushed through want_flush_seq + */ +static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) +{ + int mds, ret = 1; + + dout("check_cap_flush want %lld\n", want_flush_seq); + mutex_lock(&mdsc->mutex); + for (mds = 0; ret && mds < mdsc->max_sessions; mds++) { + struct ceph_mds_session *session = mdsc->sessions[mds]; + + if (!session) + continue; + get_session(session); + mutex_unlock(&mdsc->mutex); + + mutex_lock(&session->s_mutex); + if (!list_empty(&session->s_cap_flushing)) { + struct ceph_inode_info *ci = + list_entry(session->s_cap_flushing.next, + struct ceph_inode_info, + i_flushing_item); + struct inode *inode = &ci->vfs_inode; + + spin_lock(&inode->i_lock); + if (ci->i_cap_flush_seq <= want_flush_seq) { + dout("check_cap_flush still flushing %p " + "seq %lld <= %lld to mds%d\n", inode, + ci->i_cap_flush_seq, want_flush_seq, + session->s_mds); + ret = 0; + } + spin_unlock(&inode->i_lock); + } + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + + if (!ret) + return ret; + mutex_lock(&mdsc->mutex); + } + + mutex_unlock(&mdsc->mutex); + dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq); + return ret; +} + +/* + * called under s_mutex + */ +static void send_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + + dout("send_cap_releases mds%d\n", session->s_mds); + while (1) { + spin_lock(&session->s_cap_lock); + if (list_empty(&session->s_cap_releases_done)) + break; + msg = list_first_entry(&session->s_cap_releases_done, + struct ceph_msg, list_head); + list_del_init(&msg->list_head); + spin_unlock(&session->s_cap_lock); + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + dout("send_cap_releases mds%d %p\n", session->s_mds, msg); + ceph_con_send(&session->s_con, msg); + } + spin_unlock(&session->s_cap_lock); +} + +/* + * requests + */ + +/* + * Create an mds request. + */ +struct ceph_mds_request * +ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) +{ + struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS); + + if (!req) + return ERR_PTR(-ENOMEM); + + req->r_started = jiffies; + req->r_resend_mds = -1; + INIT_LIST_HEAD(&req->r_unsafe_dir_item); + req->r_fmode = -1; + kref_init(&req->r_kref); + INIT_LIST_HEAD(&req->r_wait); + init_completion(&req->r_completion); + init_completion(&req->r_safe_completion); + INIT_LIST_HEAD(&req->r_unsafe_item); + + req->r_op = op; + req->r_direct_mode = mode; + return req; +} + +/* + * return oldest (lowest) request, tid in request tree, 0 if none. + * + * called under mdsc->mutex. + */ +static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc) +{ + if (RB_EMPTY_ROOT(&mdsc->request_tree)) + return NULL; + return rb_entry(rb_first(&mdsc->request_tree), + struct ceph_mds_request, r_node); +} + +static u64 __get_oldest_tid(struct ceph_mds_client *mdsc) +{ + struct ceph_mds_request *req = __get_oldest_req(mdsc); + + if (req) + return req->r_tid; + return 0; +} + +/* + * Build a dentry's path. Allocate on heap; caller must kfree. Based + * on build_path_from_dentry in fs/cifs/dir.c. + * + * If @stop_on_nosnap, generate path relative to the first non-snapped + * inode. + * + * Encode hidden .snap dirs as a double /, i.e. + * foo/.snap/bar -> foo//bar + */ +char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, + int stop_on_nosnap) +{ + struct dentry *temp; + char *path; + int len, pos; + + if (dentry == NULL) + return ERR_PTR(-EINVAL); + +retry: + len = 0; + for (temp = dentry; !IS_ROOT(temp);) { + struct inode *inode = temp->d_inode; + if (inode && ceph_snap(inode) == CEPH_SNAPDIR) + len++; /* slash only */ + else if (stop_on_nosnap && inode && + ceph_snap(inode) == CEPH_NOSNAP) + break; + else + len += 1 + temp->d_name.len; + temp = temp->d_parent; + if (temp == NULL) { + pr_err("build_path_dentry corrupt dentry %p\n", dentry); + return ERR_PTR(-EINVAL); + } + } + if (len) + len--; /* no leading '/' */ + + path = kmalloc(len+1, GFP_NOFS); + if (path == NULL) + return ERR_PTR(-ENOMEM); + pos = len; + path[pos] = 0; /* trailing null */ + for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) { + struct inode *inode = temp->d_inode; + + if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { + dout("build_path_dentry path+%d: %p SNAPDIR\n", + pos, temp); + } else if (stop_on_nosnap && inode && + ceph_snap(inode) == CEPH_NOSNAP) { + break; + } else { + pos -= temp->d_name.len; + if (pos < 0) + break; + strncpy(path + pos, temp->d_name.name, + temp->d_name.len); + dout("build_path_dentry path+%d: %p '%.*s'\n", + pos, temp, temp->d_name.len, path + pos); + } + if (pos) + path[--pos] = '/'; + temp = temp->d_parent; + if (temp == NULL) { + pr_err("build_path_dentry corrupt dentry\n"); + kfree(path); + return ERR_PTR(-EINVAL); + } + } + if (pos != 0) { + pr_err("build_path_dentry did not end path lookup where " + "expected, namelen is %d, pos is %d\n", len, pos); + /* presumably this is only possible if racing with a + rename of one of the parent directories (we can not + lock the dentries above us to prevent this, but + retrying should be harmless) */ + kfree(path); + goto retry; + } + + *base = ceph_ino(temp->d_inode); + *plen = len; + dout("build_path_dentry on %p %d built %llx '%.*s'\n", + dentry, atomic_read(&dentry->d_count), *base, len, path); + return path; +} + +static int build_dentry_path(struct dentry *dentry, + const char **ppath, int *ppathlen, u64 *pino, + int *pfreepath) +{ + char *path; + + if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) { + *pino = ceph_ino(dentry->d_parent->d_inode); + *ppath = dentry->d_name.name; + *ppathlen = dentry->d_name.len; + return 0; + } + path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); + if (IS_ERR(path)) + return PTR_ERR(path); + *ppath = path; + *pfreepath = 1; + return 0; +} + +static int build_inode_path(struct inode *inode, + const char **ppath, int *ppathlen, u64 *pino, + int *pfreepath) +{ + struct dentry *dentry; + char *path; + + if (ceph_snap(inode) == CEPH_NOSNAP) { + *pino = ceph_ino(inode); + *ppathlen = 0; + return 0; + } + dentry = d_find_alias(inode); + path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); + dput(dentry); + if (IS_ERR(path)) + return PTR_ERR(path); + *ppath = path; + *pfreepath = 1; + return 0; +} + +/* + * request arguments may be specified via an inode *, a dentry *, or + * an explicit ino+path. + */ +static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, + const char *rpath, u64 rino, + const char **ppath, int *pathlen, + u64 *ino, int *freepath) +{ + int r = 0; + + if (rinode) { + r = build_inode_path(rinode, ppath, pathlen, ino, freepath); + dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), + ceph_snap(rinode)); + } else if (rdentry) { + r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath); + dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, + *ppath); + } else if (rpath) { + *ino = rino; + *ppath = rpath; + *pathlen = strlen(rpath); + dout(" path %.*s\n", *pathlen, rpath); + } + + return r; +} + +/* + * called under mdsc->mutex + */ +static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + int mds) +{ + struct ceph_msg *msg; + struct ceph_mds_request_head *head; + const char *path1 = NULL; + const char *path2 = NULL; + u64 ino1 = 0, ino2 = 0; + int pathlen1 = 0, pathlen2 = 0; + int freepath1 = 0, freepath2 = 0; + int len; + u16 releases; + void *p, *end; + int ret; + + ret = set_request_path_attr(req->r_inode, req->r_dentry, + req->r_path1, req->r_ino1.ino, + &path1, &pathlen1, &ino1, &freepath1); + if (ret < 0) { + msg = ERR_PTR(ret); + goto out; + } + + ret = set_request_path_attr(NULL, req->r_old_dentry, + req->r_path2, req->r_ino2.ino, + &path2, &pathlen2, &ino2, &freepath2); + if (ret < 0) { + msg = ERR_PTR(ret); + goto out_free1; + } + + len = sizeof(*head) + + pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)); + + /* calculate (max) length for cap releases */ + len += sizeof(struct ceph_mds_request_release) * + (!!req->r_inode_drop + !!req->r_dentry_drop + + !!req->r_old_inode_drop + !!req->r_old_dentry_drop); + if (req->r_dentry_drop) + len += req->r_dentry->d_name.len; + if (req->r_old_dentry_drop) + len += req->r_old_dentry->d_name.len; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL); + if (IS_ERR(msg)) + goto out_free2; + + msg->hdr.tid = cpu_to_le64(req->r_tid); + + head = msg->front.iov_base; + p = msg->front.iov_base + sizeof(*head); + end = msg->front.iov_base + msg->front.iov_len; + + head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); + head->op = cpu_to_le32(req->r_op); + head->caller_uid = cpu_to_le32(current_fsuid()); + head->caller_gid = cpu_to_le32(current_fsgid()); + head->args = req->r_args; + + ceph_encode_filepath(&p, end, ino1, path1); + ceph_encode_filepath(&p, end, ino2, path2); + + /* cap releases */ + releases = 0; + if (req->r_inode_drop) + releases += ceph_encode_inode_release(&p, + req->r_inode ? req->r_inode : req->r_dentry->d_inode, + mds, req->r_inode_drop, req->r_inode_unless, 0); + if (req->r_dentry_drop) + releases += ceph_encode_dentry_release(&p, req->r_dentry, + mds, req->r_dentry_drop, req->r_dentry_unless); + if (req->r_old_dentry_drop) + releases += ceph_encode_dentry_release(&p, req->r_old_dentry, + mds, req->r_old_dentry_drop, req->r_old_dentry_unless); + if (req->r_old_inode_drop) + releases += ceph_encode_inode_release(&p, + req->r_old_dentry->d_inode, + mds, req->r_old_inode_drop, req->r_old_inode_unless, 0); + head->num_releases = cpu_to_le16(releases); + + BUG_ON(p > end); + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + + msg->pages = req->r_pages; + msg->nr_pages = req->r_num_pages; + msg->hdr.data_len = cpu_to_le32(req->r_data_len); + msg->hdr.data_off = cpu_to_le16(0); + +out_free2: + if (freepath2) + kfree((char *)path2); +out_free1: + if (freepath1) + kfree((char *)path1); +out: + return msg; +} + +/* + * called under mdsc->mutex if error, under no mutex if + * success. + */ +static void complete_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + if (req->r_callback) + req->r_callback(mdsc, req); + else + complete(&req->r_completion); +} + +/* + * called under mdsc->mutex + */ +static int __prepare_send_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + int mds) +{ + struct ceph_mds_request_head *rhead; + struct ceph_msg *msg; + int flags = 0; + + req->r_mds = mds; + req->r_attempts++; + dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, + req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); + + if (req->r_request) { + ceph_msg_put(req->r_request); + req->r_request = NULL; + } + msg = create_request_message(mdsc, req, mds); + if (IS_ERR(msg)) { + req->r_reply = ERR_PTR(PTR_ERR(msg)); + complete_request(mdsc, req); + return -PTR_ERR(msg); + } + req->r_request = msg; + + rhead = msg->front.iov_base; + rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); + if (req->r_got_unsafe) + flags |= CEPH_MDS_FLAG_REPLAY; + if (req->r_locked_dir) + flags |= CEPH_MDS_FLAG_WANT_DENTRY; + rhead->flags = cpu_to_le32(flags); + rhead->num_fwd = req->r_num_fwd; + rhead->num_retry = req->r_attempts - 1; + + dout(" r_locked_dir = %p\n", req->r_locked_dir); + + if (req->r_target_inode && req->r_got_unsafe) + rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); + else + rhead->ino = 0; + return 0; +} + +/* + * send request, or put it on the appropriate wait list. + */ +static int __do_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + struct ceph_mds_session *session = NULL; + int mds = -1; + int err = -EAGAIN; + + if (req->r_reply) + goto out; + + if (req->r_timeout && + time_after_eq(jiffies, req->r_started + req->r_timeout)) { + dout("do_request timed out\n"); + err = -EIO; + goto finish; + } + + mds = __choose_mds(mdsc, req); + if (mds < 0 || + ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { + dout("do_request no mds or not active, waiting for map\n"); + list_add(&req->r_wait, &mdsc->waiting_for_map); + goto out; + } + + /* get, open session */ + session = __ceph_lookup_mds_session(mdsc, mds); + if (!session) + session = register_session(mdsc, mds); + dout("do_request mds%d session %p state %s\n", mds, session, + session_state_name(session->s_state)); + if (session->s_state != CEPH_MDS_SESSION_OPEN && + session->s_state != CEPH_MDS_SESSION_HUNG) { + if (session->s_state == CEPH_MDS_SESSION_NEW || + session->s_state == CEPH_MDS_SESSION_CLOSING) + __open_session(mdsc, session); + list_add(&req->r_wait, &session->s_waiting); + goto out_session; + } + + /* send request */ + req->r_session = get_session(session); + req->r_resend_mds = -1; /* forget any previous mds hint */ + + if (req->r_request_started == 0) /* note request start time */ + req->r_request_started = jiffies; + + err = __prepare_send_request(mdsc, req, mds); + if (!err) { + ceph_msg_get(req->r_request); + ceph_con_send(&session->s_con, req->r_request); + } + +out_session: + ceph_put_mds_session(session); +out: + return err; + +finish: + req->r_reply = ERR_PTR(err); + complete_request(mdsc, req); + goto out; +} + +/* + * called under mdsc->mutex + */ +static void __wake_requests(struct ceph_mds_client *mdsc, + struct list_head *head) +{ + struct ceph_mds_request *req, *nreq; + + list_for_each_entry_safe(req, nreq, head, r_wait) { + list_del_init(&req->r_wait); + __do_request(mdsc, req); + } +} + +/* + * Wake up threads with requests pending for @mds, so that they can + * resubmit their requests to a possibly different mds. If @all is set, + * wake up if their requests has been forwarded to @mds, too. + */ +static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all) +{ + struct ceph_mds_request *req; + struct rb_node *p; + + dout("kick_requests mds%d\n", mds); + for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) { + req = rb_entry(p, struct ceph_mds_request, r_node); + if (req->r_got_unsafe) + continue; + if (req->r_session && + req->r_session->s_mds == mds) { + dout(" kicking tid %llu\n", req->r_tid); + put_request_session(req); + __do_request(mdsc, req); + } + } +} + +void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + dout("submit_request on %p\n", req); + mutex_lock(&mdsc->mutex); + __register_request(mdsc, req, NULL); + __do_request(mdsc, req); + mutex_unlock(&mdsc->mutex); +} + +/* + * Synchrously perform an mds request. Take care of all of the + * session setup, forwarding, retry details. + */ +int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, + struct inode *dir, + struct ceph_mds_request *req) +{ + int err; + + dout("do_request on %p\n", req); + + /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */ + if (req->r_inode) + ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); + if (req->r_locked_dir) + ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); + if (req->r_old_dentry) + ceph_get_cap_refs( + ceph_inode(req->r_old_dentry->d_parent->d_inode), + CEPH_CAP_PIN); + + /* issue */ + mutex_lock(&mdsc->mutex); + __register_request(mdsc, req, dir); + __do_request(mdsc, req); + + /* wait */ + if (!req->r_reply) { + mutex_unlock(&mdsc->mutex); + if (req->r_timeout) { + err = (long)wait_for_completion_interruptible_timeout( + &req->r_completion, req->r_timeout); + if (err == 0) + req->r_reply = ERR_PTR(-EIO); + else if (err < 0) + req->r_reply = ERR_PTR(err); + } else { + err = wait_for_completion_interruptible( + &req->r_completion); + if (err) + req->r_reply = ERR_PTR(err); + } + mutex_lock(&mdsc->mutex); + } + + if (IS_ERR(req->r_reply)) { + err = PTR_ERR(req->r_reply); + req->r_reply = NULL; + + if (err == -ERESTARTSYS) { + /* aborted */ + req->r_aborted = true; + + if (req->r_locked_dir && + (req->r_op & CEPH_MDS_OP_WRITE)) { + struct ceph_inode_info *ci = + ceph_inode(req->r_locked_dir); + + dout("aborted, clearing I_COMPLETE on %p\n", + req->r_locked_dir); + spin_lock(&req->r_locked_dir->i_lock); + ci->i_ceph_flags &= ~CEPH_I_COMPLETE; + ci->i_release_count++; + spin_unlock(&req->r_locked_dir->i_lock); + } + } else { + /* clean up this request */ + __unregister_request(mdsc, req); + if (!list_empty(&req->r_unsafe_item)) + list_del_init(&req->r_unsafe_item); + complete(&req->r_safe_completion); + } + } else if (req->r_err) { + err = req->r_err; + } else { + err = le32_to_cpu(req->r_reply_info.head->result); + } + mutex_unlock(&mdsc->mutex); + + dout("do_request %p done, result %d\n", req, err); + return err; +} + +/* + * Handle mds reply. + * + * We take the session mutex and parse and process the reply immediately. + * This preserves the logical ordering of replies, capabilities, etc., sent + * by the MDS as they are applied to our local cache. + */ +static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) +{ + struct ceph_mds_client *mdsc = session->s_mdsc; + struct ceph_mds_request *req; + struct ceph_mds_reply_head *head = msg->front.iov_base; + struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */ + u64 tid; + int err, result; + int mds = session->s_mds; + + if (msg->front.iov_len < sizeof(*head)) { + pr_err("mdsc_handle_reply got corrupt (short) reply\n"); + ceph_msg_dump(msg); + return; + } + + /* get request, session */ + tid = le64_to_cpu(msg->hdr.tid); + mutex_lock(&mdsc->mutex); + req = __lookup_request(mdsc, tid); + if (!req) { + dout("handle_reply on unknown tid %llu\n", tid); + mutex_unlock(&mdsc->mutex); + return; + } + dout("handle_reply %p\n", req); + + /* correct session? */ + if (!req->r_session && req->r_session != session) { + pr_err("mdsc_handle_reply got %llu on session mds%d" + " not mds%d\n", tid, session->s_mds, + req->r_session ? req->r_session->s_mds : -1); + mutex_unlock(&mdsc->mutex); + goto out; + } + + /* dup? */ + if ((req->r_got_unsafe && !head->safe) || + (req->r_got_safe && head->safe)) { + pr_warning("got a dup %s reply on %llu from mds%d\n", + head->safe ? "safe" : "unsafe", tid, mds); + mutex_unlock(&mdsc->mutex); + goto out; + } + + result = le32_to_cpu(head->result); + + /* + * Tolerate 2 consecutive ESTALEs from the same mds. + * FIXME: we should be looking at the cap migrate_seq. + */ + if (result == -ESTALE) { + req->r_direct_mode = USE_AUTH_MDS; + req->r_num_stale++; + if (req->r_num_stale <= 2) { + __do_request(mdsc, req); + mutex_unlock(&mdsc->mutex); + goto out; + } + } else { + req->r_num_stale = 0; + } + + if (head->safe) { + req->r_got_safe = true; + __unregister_request(mdsc, req); + complete(&req->r_safe_completion); + + if (req->r_got_unsafe) { + /* + * We already handled the unsafe response, now do the + * cleanup. No need to examine the response; the MDS + * doesn't include any result info in the safe + * response. And even if it did, there is nothing + * useful we could do with a revised return value. + */ + dout("got safe reply %llu, mds%d\n", tid, mds); + list_del_init(&req->r_unsafe_item); + + /* last unsafe request during umount? */ + if (mdsc->stopping && !__get_oldest_req(mdsc)) + complete(&mdsc->safe_umount_waiters); + mutex_unlock(&mdsc->mutex); + goto out; + } + } + + BUG_ON(req->r_reply); + + if (!head->safe) { + req->r_got_unsafe = true; + list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); + } + + dout("handle_reply tid %lld result %d\n", tid, result); + rinfo = &req->r_reply_info; + err = parse_reply_info(msg, rinfo); + mutex_unlock(&mdsc->mutex); + + mutex_lock(&session->s_mutex); + if (err < 0) { + pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds); + ceph_msg_dump(msg); + goto out_err; + } + + /* snap trace */ + if (rinfo->snapblob_len) { + down_write(&mdsc->snap_rwsem); + ceph_update_snap_trace(mdsc, rinfo->snapblob, + rinfo->snapblob + rinfo->snapblob_len, + le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP); + downgrade_write(&mdsc->snap_rwsem); + } else { + down_read(&mdsc->snap_rwsem); + } + + /* insert trace into our cache */ + err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); + if (err == 0) { + if (result == 0 && rinfo->dir_nr) + ceph_readdir_prepopulate(req, req->r_session); + ceph_unreserve_caps(&req->r_caps_reservation); + } + + up_read(&mdsc->snap_rwsem); +out_err: + if (err) { + req->r_err = err; + } else { + req->r_reply = msg; + ceph_msg_get(msg); + } + + add_cap_releases(mdsc, req->r_session, -1); + mutex_unlock(&session->s_mutex); + + /* kick calling process */ + complete_request(mdsc, req); +out: + ceph_mdsc_put_request(req); + return; +} + + + +/* + * handle mds notification that our request has been forwarded. + */ +static void handle_forward(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct ceph_mds_request *req; + u64 tid = le64_to_cpu(msg->hdr.tid); + u32 next_mds; + u32 fwd_seq; + int err = -EINVAL; + void *p = msg->front.iov_base; + void *end = p + msg->front.iov_len; + + ceph_decode_need(&p, end, 2*sizeof(u32), bad); + next_mds = ceph_decode_32(&p); + fwd_seq = ceph_decode_32(&p); + + mutex_lock(&mdsc->mutex); + req = __lookup_request(mdsc, tid); + if (!req) { + dout("forward %llu to mds%d - req dne\n", tid, next_mds); + goto out; /* dup reply? */ + } + + if (fwd_seq <= req->r_num_fwd) { + dout("forward %llu to mds%d - old seq %d <= %d\n", + tid, next_mds, req->r_num_fwd, fwd_seq); + } else { + /* resend. forward race not possible; mds would drop */ + dout("forward %llu to mds%d (we resend)\n", tid, next_mds); + req->r_num_fwd = fwd_seq; + req->r_resend_mds = next_mds; + put_request_session(req); + __do_request(mdsc, req); + } + ceph_mdsc_put_request(req); +out: + mutex_unlock(&mdsc->mutex); + return; + +bad: + pr_err("mdsc_handle_forward decode error err=%d\n", err); +} + +/* + * handle a mds session control message + */ +static void handle_session(struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct ceph_mds_client *mdsc = session->s_mdsc; + u32 op; + u64 seq; + int mds = session->s_mds; + struct ceph_mds_session_head *h = msg->front.iov_base; + int wake = 0; + + /* decode */ + if (msg->front.iov_len != sizeof(*h)) + goto bad; + op = le32_to_cpu(h->op); + seq = le64_to_cpu(h->seq); + + mutex_lock(&mdsc->mutex); + if (op == CEPH_SESSION_CLOSE) + __unregister_session(mdsc, session); + /* FIXME: this ttl calculation is generous */ + session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose; + mutex_unlock(&mdsc->mutex); + + mutex_lock(&session->s_mutex); + + dout("handle_session mds%d %s %p state %s seq %llu\n", + mds, ceph_session_op_name(op), session, + session_state_name(session->s_state), seq); + + if (session->s_state == CEPH_MDS_SESSION_HUNG) { + session->s_state = CEPH_MDS_SESSION_OPEN; + pr_info("mds%d came back\n", session->s_mds); + } + + switch (op) { + case CEPH_SESSION_OPEN: + session->s_state = CEPH_MDS_SESSION_OPEN; + renewed_caps(mdsc, session, 0); + wake = 1; + if (mdsc->stopping) + __close_session(mdsc, session); + break; + + case CEPH_SESSION_RENEWCAPS: + if (session->s_renew_seq == seq) + renewed_caps(mdsc, session, 1); + break; + + case CEPH_SESSION_CLOSE: + remove_session_caps(session); + wake = 1; /* for good measure */ + complete(&mdsc->session_close_waiters); + kick_requests(mdsc, mds, 0); /* cur only */ + break; + + case CEPH_SESSION_STALE: + pr_info("mds%d caps went stale, renewing\n", + session->s_mds); + spin_lock(&session->s_cap_lock); + session->s_cap_gen++; + session->s_cap_ttl = 0; + spin_unlock(&session->s_cap_lock); + send_renew_caps(mdsc, session); + break; + + case CEPH_SESSION_RECALL_STATE: + trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); + break; + + default: + pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); + WARN_ON(1); + } + + mutex_unlock(&session->s_mutex); + if (wake) { + mutex_lock(&mdsc->mutex); + __wake_requests(mdsc, &session->s_waiting); + mutex_unlock(&mdsc->mutex); + } + return; + +bad: + pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds, + (int)msg->front.iov_len); + ceph_msg_dump(msg); + return; +} + + +/* + * called under session->mutex. + */ +static void replay_unsafe_requests(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_mds_request *req, *nreq; + int err; + + dout("replay_unsafe_requests mds%d\n", session->s_mds); + + mutex_lock(&mdsc->mutex); + list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) { + err = __prepare_send_request(mdsc, req, session->s_mds); + if (!err) { + ceph_msg_get(req->r_request); + ceph_con_send(&session->s_con, req->r_request); + } + } + mutex_unlock(&mdsc->mutex); +} + +/* + * Encode information about a cap for a reconnect with the MDS. + */ +static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, + void *arg) +{ + struct ceph_mds_cap_reconnect rec; + struct ceph_inode_info *ci; + struct ceph_pagelist *pagelist = arg; + char *path; + int pathlen, err; + u64 pathbase; + struct dentry *dentry; + + ci = cap->ci; + + dout(" adding %p ino %llx.%llx cap %p %lld %s\n", + inode, ceph_vinop(inode), cap, cap->cap_id, + ceph_cap_string(cap->issued)); + err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); + if (err) + return err; + + dentry = d_find_alias(inode); + if (dentry) { + path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0); + if (IS_ERR(path)) { + err = PTR_ERR(path); + BUG_ON(err); + } + } else { + path = NULL; + pathlen = 0; + } + err = ceph_pagelist_encode_string(pagelist, path, pathlen); + if (err) + goto out; + + spin_lock(&inode->i_lock); + cap->seq = 0; /* reset cap seq */ + cap->issue_seq = 0; /* and issue_seq */ + rec.cap_id = cpu_to_le64(cap->cap_id); + rec.pathbase = cpu_to_le64(pathbase); + rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); + rec.issued = cpu_to_le32(cap->issued); + rec.size = cpu_to_le64(inode->i_size); + ceph_encode_timespec(&rec.mtime, &inode->i_mtime); + ceph_encode_timespec(&rec.atime, &inode->i_atime); + rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); + spin_unlock(&inode->i_lock); + + err = ceph_pagelist_append(pagelist, &rec, sizeof(rec)); + +out: + kfree(path); + dput(dentry); + return err; +} + + +/* + * If an MDS fails and recovers, clients need to reconnect in order to + * reestablish shared state. This includes all caps issued through + * this session _and_ the snap_realm hierarchy. Because it's not + * clear which snap realms the mds cares about, we send everything we + * know about.. that ensures we'll then get any new info the + * recovering MDS might have. + * + * This is a relatively heavyweight operation, but it's rare. + * + * called with mdsc->mutex held. + */ +static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) +{ + struct ceph_mds_session *session = NULL; + struct ceph_msg *reply; + struct rb_node *p; + int err; + struct ceph_pagelist *pagelist; + + pr_info("reconnect to recovering mds%d\n", mds); + + pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); + if (!pagelist) + goto fail_nopagelist; + ceph_pagelist_init(pagelist); + + reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL); + if (IS_ERR(reply)) { + err = PTR_ERR(reply); + goto fail_nomsg; + } + + /* find session */ + session = __ceph_lookup_mds_session(mdsc, mds); + mutex_unlock(&mdsc->mutex); /* drop lock for duration */ + + if (session) { + mutex_lock(&session->s_mutex); + + session->s_state = CEPH_MDS_SESSION_RECONNECTING; + session->s_seq = 0; + + ceph_con_open(&session->s_con, + ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); + + /* replay unsafe requests */ + replay_unsafe_requests(mdsc, session); + } else { + dout("no session for mds%d, will send short reconnect\n", + mds); + } + + down_read(&mdsc->snap_rwsem); + + if (!session) + goto send; + dout("session %p state %s\n", session, + session_state_name(session->s_state)); + + /* traverse this session's caps */ + err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); + if (err) + goto fail; + err = iterate_session_caps(session, encode_caps_cb, pagelist); + if (err < 0) + goto out; + + /* + * snaprealms. we provide mds with the ino, seq (version), and + * parent for all of our realms. If the mds has any newer info, + * it will tell us. + */ + for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) { + struct ceph_snap_realm *realm = + rb_entry(p, struct ceph_snap_realm, node); + struct ceph_mds_snaprealm_reconnect sr_rec; + + dout(" adding snap realm %llx seq %lld parent %llx\n", + realm->ino, realm->seq, realm->parent_ino); + sr_rec.ino = cpu_to_le64(realm->ino); + sr_rec.seq = cpu_to_le64(realm->seq); + sr_rec.parent = cpu_to_le64(realm->parent_ino); + err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec)); + if (err) + goto fail; + } + +send: + reply->pagelist = pagelist; + reply->hdr.data_len = cpu_to_le32(pagelist->length); + reply->nr_pages = calc_pages_for(0, pagelist->length); + ceph_con_send(&session->s_con, reply); + + if (session) { + session->s_state = CEPH_MDS_SESSION_OPEN; + __wake_requests(mdsc, &session->s_waiting); + } + +out: + up_read(&mdsc->snap_rwsem); + if (session) { + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + } + mutex_lock(&mdsc->mutex); + return; + +fail: + ceph_msg_put(reply); +fail_nomsg: + ceph_pagelist_release(pagelist); + kfree(pagelist); +fail_nopagelist: + pr_err("ENOMEM preparing reconnect for mds%d\n", mds); + goto out; +} + + +/* + * compare old and new mdsmaps, kicking requests + * and closing out old connections as necessary + * + * called under mdsc->mutex. + */ +static void check_new_map(struct ceph_mds_client *mdsc, + struct ceph_mdsmap *newmap, + struct ceph_mdsmap *oldmap) +{ + int i; + int oldstate, newstate; + struct ceph_mds_session *s; + + dout("check_new_map new %u old %u\n", + newmap->m_epoch, oldmap->m_epoch); + + for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) { + if (mdsc->sessions[i] == NULL) + continue; + s = mdsc->sessions[i]; + oldstate = ceph_mdsmap_get_state(oldmap, i); + newstate = ceph_mdsmap_get_state(newmap, i); + + dout("check_new_map mds%d state %s -> %s (session %s)\n", + i, ceph_mds_state_name(oldstate), + ceph_mds_state_name(newstate), + session_state_name(s->s_state)); + + if (memcmp(ceph_mdsmap_get_addr(oldmap, i), + ceph_mdsmap_get_addr(newmap, i), + sizeof(struct ceph_entity_addr))) { + if (s->s_state == CEPH_MDS_SESSION_OPENING) { + /* the session never opened, just close it + * out now */ + __wake_requests(mdsc, &s->s_waiting); + __unregister_session(mdsc, s); + } else { + /* just close it */ + mutex_unlock(&mdsc->mutex); + mutex_lock(&s->s_mutex); + mutex_lock(&mdsc->mutex); + ceph_con_close(&s->s_con); + mutex_unlock(&s->s_mutex); + s->s_state = CEPH_MDS_SESSION_RESTARTING; + } + + /* kick any requests waiting on the recovering mds */ + kick_requests(mdsc, i, 1); + } else if (oldstate == newstate) { + continue; /* nothing new with this mds */ + } + + /* + * send reconnect? + */ + if (s->s_state == CEPH_MDS_SESSION_RESTARTING && + newstate >= CEPH_MDS_STATE_RECONNECT) + send_mds_reconnect(mdsc, i); + + /* + * kick requests on any mds that has gone active. + * + * kick requests on cur or forwarder: we may have sent + * the request to mds1, mds1 told us it forwarded it + * to mds2, but then we learn mds1 failed and can't be + * sure it successfully forwarded our request before + * it died. + */ + if (oldstate < CEPH_MDS_STATE_ACTIVE && + newstate >= CEPH_MDS_STATE_ACTIVE) { + pr_info("mds%d reconnect completed\n", s->s_mds); + kick_requests(mdsc, i, 1); + ceph_kick_flushing_caps(mdsc, s); + wake_up_session_caps(s, 1); + } + } +} + + + +/* + * leases + */ + +/* + * caller must hold session s_mutex, dentry->d_lock + */ +void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + + ceph_put_mds_session(di->lease_session); + di->lease_session = NULL; +} + +static void handle_lease(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct super_block *sb = mdsc->client->sb; + struct inode *inode; + struct ceph_inode_info *ci; + struct dentry *parent, *dentry; + struct ceph_dentry_info *di; + int mds = session->s_mds; + struct ceph_mds_lease *h = msg->front.iov_base; + struct ceph_vino vino; + int mask; + struct qstr dname; + int release = 0; + + dout("handle_lease from mds%d\n", mds); + + /* decode */ + if (msg->front.iov_len < sizeof(*h) + sizeof(u32)) + goto bad; + vino.ino = le64_to_cpu(h->ino); + vino.snap = CEPH_NOSNAP; + mask = le16_to_cpu(h->mask); + dname.name = (void *)h + sizeof(*h) + sizeof(u32); + dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32); + if (dname.len != get_unaligned_le32(h+1)) + goto bad; + + mutex_lock(&session->s_mutex); + session->s_seq++; + + /* lookup inode */ + inode = ceph_find_inode(sb, vino); + dout("handle_lease '%s', mask %d, ino %llx %p\n", + ceph_lease_op_name(h->action), mask, vino.ino, inode); + if (inode == NULL) { + dout("handle_lease no inode %llx\n", vino.ino); + goto release; + } + ci = ceph_inode(inode); + + /* dentry */ + parent = d_find_alias(inode); + if (!parent) { + dout("no parent dentry on inode %p\n", inode); + WARN_ON(1); + goto release; /* hrm... */ + } + dname.hash = full_name_hash(dname.name, dname.len); + dentry = d_lookup(parent, &dname); + dput(parent); + if (!dentry) + goto release; + + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + switch (h->action) { + case CEPH_MDS_LEASE_REVOKE: + if (di && di->lease_session == session) { + h->seq = cpu_to_le32(di->lease_seq); + __ceph_mdsc_drop_dentry_lease(dentry); + } + release = 1; + break; + + case CEPH_MDS_LEASE_RENEW: + if (di && di->lease_session == session && + di->lease_gen == session->s_cap_gen && + di->lease_renew_from && + di->lease_renew_after == 0) { + unsigned long duration = + le32_to_cpu(h->duration_ms) * HZ / 1000; + + di->lease_seq = le32_to_cpu(h->seq); + dentry->d_time = di->lease_renew_from + duration; + di->lease_renew_after = di->lease_renew_from + + (duration >> 1); + di->lease_renew_from = 0; + } + break; + } + spin_unlock(&dentry->d_lock); + dput(dentry); + + if (!release) + goto out; + +release: + /* let's just reuse the same message */ + h->action = CEPH_MDS_LEASE_REVOKE_ACK; + ceph_msg_get(msg); + ceph_con_send(&session->s_con, msg); + +out: + iput(inode); + mutex_unlock(&session->s_mutex); + return; + +bad: + pr_err("corrupt lease message\n"); + ceph_msg_dump(msg); +} + +void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, + struct inode *inode, + struct dentry *dentry, char action, + u32 seq) +{ + struct ceph_msg *msg; + struct ceph_mds_lease *lease; + int len = sizeof(*lease) + sizeof(u32); + int dnamelen = 0; + + dout("lease_send_msg inode %p dentry %p %s to mds%d\n", + inode, dentry, ceph_lease_op_name(action), session->s_mds); + dnamelen = dentry->d_name.len; + len += dnamelen; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL); + if (IS_ERR(msg)) + return; + lease = msg->front.iov_base; + lease->action = action; + lease->mask = cpu_to_le16(CEPH_LOCK_DN); + lease->ino = cpu_to_le64(ceph_vino(inode).ino); + lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); + lease->seq = cpu_to_le32(seq); + put_unaligned_le32(dnamelen, lease + 1); + memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen); + + /* + * if this is a preemptive lease RELEASE, no need to + * flush request stream, since the actual request will + * soon follow. + */ + msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE); + + ceph_con_send(&session->s_con, msg); +} + +/* + * Preemptively release a lease we expect to invalidate anyway. + * Pass @inode always, @dentry is optional. + */ +void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, + struct dentry *dentry, int mask) +{ + struct ceph_dentry_info *di; + struct ceph_mds_session *session; + u32 seq; + + BUG_ON(inode == NULL); + BUG_ON(dentry == NULL); + BUG_ON(mask != CEPH_LOCK_DN); + + /* is dentry lease valid? */ + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + if (!di || !di->lease_session || + di->lease_session->s_mds < 0 || + di->lease_gen != di->lease_session->s_cap_gen || + !time_before(jiffies, dentry->d_time)) { + dout("lease_release inode %p dentry %p -- " + "no lease on %d\n", + inode, dentry, mask); + spin_unlock(&dentry->d_lock); + return; + } + + /* we do have a lease on this dentry; note mds and seq */ + session = ceph_get_mds_session(di->lease_session); + seq = di->lease_seq; + __ceph_mdsc_drop_dentry_lease(dentry); + spin_unlock(&dentry->d_lock); + + dout("lease_release inode %p dentry %p mask %d to mds%d\n", + inode, dentry, mask, session->s_mds); + ceph_mdsc_lease_send_msg(session, inode, dentry, + CEPH_MDS_LEASE_RELEASE, seq); + ceph_put_mds_session(session); +} + +/* + * drop all leases (and dentry refs) in preparation for umount + */ +static void drop_leases(struct ceph_mds_client *mdsc) +{ + int i; + + dout("drop_leases\n"); + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); + if (!s) + continue; + mutex_unlock(&mdsc->mutex); + mutex_lock(&s->s_mutex); + mutex_unlock(&s->s_mutex); + ceph_put_mds_session(s); + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); +} + + + +/* + * delayed work -- periodically trim expired leases, renew caps with mds + */ +static void schedule_delayed(struct ceph_mds_client *mdsc) +{ + int delay = 5; + unsigned hz = round_jiffies_relative(HZ * delay); + schedule_delayed_work(&mdsc->delayed_work, hz); +} + +static void delayed_work(struct work_struct *work) +{ + int i; + struct ceph_mds_client *mdsc = + container_of(work, struct ceph_mds_client, delayed_work.work); + int renew_interval; + int renew_caps; + + dout("mdsc delayed_work\n"); + ceph_check_delayed_caps(mdsc); + + mutex_lock(&mdsc->mutex); + renew_interval = mdsc->mdsmap->m_session_timeout >> 2; + renew_caps = time_after_eq(jiffies, HZ*renew_interval + + mdsc->last_renew_caps); + if (renew_caps) + mdsc->last_renew_caps = jiffies; + + for (i = 0; i < mdsc->max_sessions; i++) { + struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); + if (s == NULL) + continue; + if (s->s_state == CEPH_MDS_SESSION_CLOSING) { + dout("resending session close request for mds%d\n", + s->s_mds); + request_close_session(mdsc, s); + ceph_put_mds_session(s); + continue; + } + if (s->s_ttl && time_after(jiffies, s->s_ttl)) { + if (s->s_state == CEPH_MDS_SESSION_OPEN) { + s->s_state = CEPH_MDS_SESSION_HUNG; + pr_info("mds%d hung\n", s->s_mds); + } + } + if (s->s_state < CEPH_MDS_SESSION_OPEN) { + /* this mds is failed or recovering, just wait */ + ceph_put_mds_session(s); + continue; + } + mutex_unlock(&mdsc->mutex); + + mutex_lock(&s->s_mutex); + if (renew_caps) + send_renew_caps(mdsc, s); + else + ceph_con_keepalive(&s->s_con); + add_cap_releases(mdsc, s, -1); + send_cap_releases(mdsc, s); + mutex_unlock(&s->s_mutex); + ceph_put_mds_session(s); + + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); + + schedule_delayed(mdsc); +} + + +int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) +{ + mdsc->client = client; + mutex_init(&mdsc->mutex); + mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); + init_completion(&mdsc->safe_umount_waiters); + init_completion(&mdsc->session_close_waiters); + INIT_LIST_HEAD(&mdsc->waiting_for_map); + mdsc->sessions = NULL; + mdsc->max_sessions = 0; + mdsc->stopping = 0; + init_rwsem(&mdsc->snap_rwsem); + mdsc->snap_realms = RB_ROOT; + INIT_LIST_HEAD(&mdsc->snap_empty); + spin_lock_init(&mdsc->snap_empty_lock); + mdsc->last_tid = 0; + mdsc->request_tree = RB_ROOT; + INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); + mdsc->last_renew_caps = jiffies; + INIT_LIST_HEAD(&mdsc->cap_delay_list); + spin_lock_init(&mdsc->cap_delay_lock); + INIT_LIST_HEAD(&mdsc->snap_flush_list); + spin_lock_init(&mdsc->snap_flush_lock); + mdsc->cap_flush_seq = 0; + INIT_LIST_HEAD(&mdsc->cap_dirty); + mdsc->num_cap_flushing = 0; + spin_lock_init(&mdsc->cap_dirty_lock); + init_waitqueue_head(&mdsc->cap_flushing_wq); + spin_lock_init(&mdsc->dentry_lru_lock); + INIT_LIST_HEAD(&mdsc->dentry_lru); + return 0; +} + +/* + * Wait for safe replies on open mds requests. If we time out, drop + * all requests from the tree to avoid dangling dentry refs. + */ +static void wait_requests(struct ceph_mds_client *mdsc) +{ + struct ceph_mds_request *req; + struct ceph_client *client = mdsc->client; + + mutex_lock(&mdsc->mutex); + if (__get_oldest_req(mdsc)) { + mutex_unlock(&mdsc->mutex); + + dout("wait_requests waiting for requests\n"); + wait_for_completion_timeout(&mdsc->safe_umount_waiters, + client->mount_args->mount_timeout * HZ); + + /* tear down remaining requests */ + mutex_lock(&mdsc->mutex); + while ((req = __get_oldest_req(mdsc))) { + dout("wait_requests timed out on tid %llu\n", + req->r_tid); + __unregister_request(mdsc, req); + } + } + mutex_unlock(&mdsc->mutex); + dout("wait_requests done\n"); +} + +/* + * called before mount is ro, and before dentries are torn down. + * (hmm, does this still race with new lookups?) + */ +void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) +{ + dout("pre_umount\n"); + mdsc->stopping = 1; + + drop_leases(mdsc); + ceph_flush_dirty_caps(mdsc); + wait_requests(mdsc); +} + +/* + * wait for all write mds requests to flush. + */ +static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) +{ + struct ceph_mds_request *req = NULL; + struct rb_node *n; + + mutex_lock(&mdsc->mutex); + dout("wait_unsafe_requests want %lld\n", want_tid); + req = __get_oldest_req(mdsc); + while (req && req->r_tid <= want_tid) { + if ((req->r_op & CEPH_MDS_OP_WRITE)) { + /* write op */ + ceph_mdsc_get_request(req); + mutex_unlock(&mdsc->mutex); + dout("wait_unsafe_requests wait on %llu (want %llu)\n", + req->r_tid, want_tid); + wait_for_completion(&req->r_safe_completion); + mutex_lock(&mdsc->mutex); + n = rb_next(&req->r_node); + ceph_mdsc_put_request(req); + } else { + n = rb_next(&req->r_node); + } + if (!n) + break; + req = rb_entry(n, struct ceph_mds_request, r_node); + } + mutex_unlock(&mdsc->mutex); + dout("wait_unsafe_requests done\n"); +} + +void ceph_mdsc_sync(struct ceph_mds_client *mdsc) +{ + u64 want_tid, want_flush; + + dout("sync\n"); + mutex_lock(&mdsc->mutex); + want_tid = mdsc->last_tid; + want_flush = mdsc->cap_flush_seq; + mutex_unlock(&mdsc->mutex); + dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); + + ceph_flush_dirty_caps(mdsc); + + wait_unsafe_requests(mdsc, want_tid); + wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush)); +} + + +/* + * called after sb is ro. + */ +void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) +{ + struct ceph_mds_session *session; + int i; + int n; + struct ceph_client *client = mdsc->client; + unsigned long started, timeout = client->mount_args->mount_timeout * HZ; + + dout("close_sessions\n"); + + mutex_lock(&mdsc->mutex); + + /* close sessions */ + started = jiffies; + while (time_before(jiffies, started + timeout)) { + dout("closing sessions\n"); + n = 0; + for (i = 0; i < mdsc->max_sessions; i++) { + session = __ceph_lookup_mds_session(mdsc, i); + if (!session) + continue; + mutex_unlock(&mdsc->mutex); + mutex_lock(&session->s_mutex); + __close_session(mdsc, session); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + mutex_lock(&mdsc->mutex); + n++; + } + if (n == 0) + break; + + if (client->mount_state == CEPH_MOUNT_SHUTDOWN) + break; + + dout("waiting for sessions to close\n"); + mutex_unlock(&mdsc->mutex); + wait_for_completion_timeout(&mdsc->session_close_waiters, + timeout); + mutex_lock(&mdsc->mutex); + } + + /* tear down remaining sessions */ + for (i = 0; i < mdsc->max_sessions; i++) { + if (mdsc->sessions[i]) { + session = get_session(mdsc->sessions[i]); + __unregister_session(mdsc, session); + mutex_unlock(&mdsc->mutex); + mutex_lock(&session->s_mutex); + remove_session_caps(session); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + mutex_lock(&mdsc->mutex); + } + } + + WARN_ON(!list_empty(&mdsc->cap_delay_list)); + + mutex_unlock(&mdsc->mutex); + + ceph_cleanup_empty_realms(mdsc); + + cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ + + dout("stopped\n"); +} + +void ceph_mdsc_stop(struct ceph_mds_client *mdsc) +{ + dout("stop\n"); + cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ + if (mdsc->mdsmap) + ceph_mdsmap_destroy(mdsc->mdsmap); + kfree(mdsc->sessions); +} + + +/* + * handle mds map update. + */ +void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) +{ + u32 epoch; + u32 maplen; + void *p = msg->front.iov_base; + void *end = p + msg->front.iov_len; + struct ceph_mdsmap *newmap, *oldmap; + struct ceph_fsid fsid; + int err = -EINVAL; + + ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); + ceph_decode_copy(&p, &fsid, sizeof(fsid)); + if (ceph_check_fsid(mdsc->client, &fsid) < 0) + return; + epoch = ceph_decode_32(&p); + maplen = ceph_decode_32(&p); + dout("handle_map epoch %u len %d\n", epoch, (int)maplen); + + /* do we need it? */ + ceph_monc_got_mdsmap(&mdsc->client->monc, epoch); + mutex_lock(&mdsc->mutex); + if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { + dout("handle_map epoch %u <= our %u\n", + epoch, mdsc->mdsmap->m_epoch); + mutex_unlock(&mdsc->mutex); + return; + } + + newmap = ceph_mdsmap_decode(&p, end); + if (IS_ERR(newmap)) { + err = PTR_ERR(newmap); + goto bad_unlock; + } + + /* swap into place */ + if (mdsc->mdsmap) { + oldmap = mdsc->mdsmap; + mdsc->mdsmap = newmap; + check_new_map(mdsc, newmap, oldmap); + ceph_mdsmap_destroy(oldmap); + } else { + mdsc->mdsmap = newmap; /* first mds map */ + } + mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; + + __wake_requests(mdsc, &mdsc->waiting_for_map); + + mutex_unlock(&mdsc->mutex); + schedule_delayed(mdsc); + return; + +bad_unlock: + mutex_unlock(&mdsc->mutex); +bad: + pr_err("error decoding mdsmap %d\n", err); + return; +} + +static struct ceph_connection *con_get(struct ceph_connection *con) +{ + struct ceph_mds_session *s = con->private; + + if (get_session(s)) { + dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref)); + return con; + } + dout("mdsc con_get %p FAIL\n", s); + return NULL; +} + +static void con_put(struct ceph_connection *con) +{ + struct ceph_mds_session *s = con->private; + + ceph_put_mds_session(s); + dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref)); +} + +/* + * if the client is unresponsive for long enough, the mds will kill + * the session entirely. + */ +static void peer_reset(struct ceph_connection *con) +{ + struct ceph_mds_session *s = con->private; + + pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n", + s->s_mds); +} + +static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + int type = le16_to_cpu(msg->hdr.type); + + mutex_lock(&mdsc->mutex); + if (__verify_registered_session(mdsc, s) < 0) { + mutex_unlock(&mdsc->mutex); + goto out; + } + mutex_unlock(&mdsc->mutex); + + switch (type) { + case CEPH_MSG_MDS_MAP: + ceph_mdsc_handle_map(mdsc, msg); + break; + case CEPH_MSG_CLIENT_SESSION: + handle_session(s, msg); + break; + case CEPH_MSG_CLIENT_REPLY: + handle_reply(s, msg); + break; + case CEPH_MSG_CLIENT_REQUEST_FORWARD: + handle_forward(mdsc, s, msg); + break; + case CEPH_MSG_CLIENT_CAPS: + ceph_handle_caps(s, msg); + break; + case CEPH_MSG_CLIENT_SNAP: + ceph_handle_snap(mdsc, s, msg); + break; + case CEPH_MSG_CLIENT_LEASE: + handle_lease(mdsc, s, msg); + break; + + default: + pr_err("received unknown message type %d %s\n", type, + ceph_msg_type_name(type)); + } +out: + ceph_msg_put(msg); +} + +/* + * authentication + */ +static int get_authorizer(struct ceph_connection *con, + void **buf, int *len, int *proto, + void **reply_buf, int *reply_len, int force_new) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_auth_client *ac = mdsc->client->monc.auth; + int ret = 0; + + if (force_new && s->s_authorizer) { + ac->ops->destroy_authorizer(ac, s->s_authorizer); + s->s_authorizer = NULL; + } + if (s->s_authorizer == NULL) { + if (ac->ops->create_authorizer) { + ret = ac->ops->create_authorizer( + ac, CEPH_ENTITY_TYPE_MDS, + &s->s_authorizer, + &s->s_authorizer_buf, + &s->s_authorizer_buf_len, + &s->s_authorizer_reply_buf, + &s->s_authorizer_reply_buf_len); + if (ret) + return ret; + } + } + + *proto = ac->protocol; + *buf = s->s_authorizer_buf; + *len = s->s_authorizer_buf_len; + *reply_buf = s->s_authorizer_reply_buf; + *reply_len = s->s_authorizer_reply_buf_len; + return 0; +} + + +static int verify_authorizer_reply(struct ceph_connection *con, int len) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_auth_client *ac = mdsc->client->monc.auth; + + return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len); +} + +static int invalidate_authorizer(struct ceph_connection *con) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_auth_client *ac = mdsc->client->monc.auth; + + if (ac->ops->invalidate_authorizer) + ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); + + return ceph_monc_validate_auth(&mdsc->client->monc); +} + +const static struct ceph_connection_operations mds_con_ops = { + .get = con_get, + .put = con_put, + .dispatch = dispatch, + .get_authorizer = get_authorizer, + .verify_authorizer_reply = verify_authorizer_reply, + .invalidate_authorizer = invalidate_authorizer, + .peer_reset = peer_reset, +}; + + + + +/* eof */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h new file mode 100644 index 00000000000..961cc6f6587 --- /dev/null +++ b/fs/ceph/mds_client.h @@ -0,0 +1,335 @@ +#ifndef _FS_CEPH_MDS_CLIENT_H +#define _FS_CEPH_MDS_CLIENT_H + +#include <linux/completion.h> +#include <linux/kref.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/rbtree.h> +#include <linux/spinlock.h> + +#include "types.h" +#include "messenger.h" +#include "mdsmap.h" + +/* + * Some lock dependencies: + * + * session->s_mutex + * mdsc->mutex + * + * mdsc->snap_rwsem + * + * inode->i_lock + * mdsc->snap_flush_lock + * mdsc->cap_delay_lock + * + */ + +struct ceph_client; +struct ceph_cap; + +/* + * parsed info about a single inode. pointers are into the encoded + * on-wire structures within the mds reply message payload. + */ +struct ceph_mds_reply_info_in { + struct ceph_mds_reply_inode *in; + u32 symlink_len; + char *symlink; + u32 xattr_len; + char *xattr_data; +}; + +/* + * parsed info about an mds reply, including information about the + * target inode and/or its parent directory and dentry, and directory + * contents (for readdir results). + */ +struct ceph_mds_reply_info_parsed { + struct ceph_mds_reply_head *head; + + struct ceph_mds_reply_info_in diri, targeti; + struct ceph_mds_reply_dirfrag *dirfrag; + char *dname; + u32 dname_len; + struct ceph_mds_reply_lease *dlease; + + struct ceph_mds_reply_dirfrag *dir_dir; + int dir_nr; + char **dir_dname; + u32 *dir_dname_len; + struct ceph_mds_reply_lease **dir_dlease; + struct ceph_mds_reply_info_in *dir_in; + u8 dir_complete, dir_end; + + /* encoded blob describing snapshot contexts for certain + operations (e.g., open) */ + void *snapblob; + int snapblob_len; +}; + + +/* + * cap releases are batched and sent to the MDS en masse. + */ +#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \ + sizeof(struct ceph_mds_cap_release)) / \ + sizeof(struct ceph_mds_cap_item)) + + +/* + * state associated with each MDS<->client session + */ +enum { + CEPH_MDS_SESSION_NEW = 1, + CEPH_MDS_SESSION_OPENING = 2, + CEPH_MDS_SESSION_OPEN = 3, + CEPH_MDS_SESSION_HUNG = 4, + CEPH_MDS_SESSION_CLOSING = 5, + CEPH_MDS_SESSION_RESTARTING = 6, + CEPH_MDS_SESSION_RECONNECTING = 7, +}; + +struct ceph_mds_session { + struct ceph_mds_client *s_mdsc; + int s_mds; + int s_state; + unsigned long s_ttl; /* time until mds kills us */ + u64 s_seq; /* incoming msg seq # */ + struct mutex s_mutex; /* serialize session messages */ + + struct ceph_connection s_con; + + struct ceph_authorizer *s_authorizer; + void *s_authorizer_buf, *s_authorizer_reply_buf; + size_t s_authorizer_buf_len, s_authorizer_reply_buf_len; + + /* protected by s_cap_lock */ + spinlock_t s_cap_lock; + u32 s_cap_gen; /* inc each time we get mds stale msg */ + unsigned long s_cap_ttl; /* when session caps expire */ + struct list_head s_caps; /* all caps issued by this session */ + int s_nr_caps, s_trim_caps; + int s_num_cap_releases; + struct list_head s_cap_releases; /* waiting cap_release messages */ + struct list_head s_cap_releases_done; /* ready to send */ + struct ceph_cap *s_cap_iterator; + + /* protected by mutex */ + struct list_head s_cap_flushing; /* inodes w/ flushing caps */ + struct list_head s_cap_snaps_flushing; + unsigned long s_renew_requested; /* last time we sent a renew req */ + u64 s_renew_seq; + + atomic_t s_ref; + struct list_head s_waiting; /* waiting requests */ + struct list_head s_unsafe; /* unsafe requests */ +}; + +/* + * modes of choosing which MDS to send a request to + */ +enum { + USE_ANY_MDS, + USE_RANDOM_MDS, + USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */ +}; + +struct ceph_mds_request; +struct ceph_mds_client; + +/* + * request completion callback + */ +typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc, + struct ceph_mds_request *req); + +/* + * an in-flight mds request + */ +struct ceph_mds_request { + u64 r_tid; /* transaction id */ + struct rb_node r_node; + + int r_op; /* mds op code */ + int r_mds; + + /* operation on what? */ + struct inode *r_inode; /* arg1 */ + struct dentry *r_dentry; /* arg1 */ + struct dentry *r_old_dentry; /* arg2: rename from or link from */ + char *r_path1, *r_path2; + struct ceph_vino r_ino1, r_ino2; + + struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ + struct inode *r_target_inode; /* resulting inode */ + + union ceph_mds_request_args r_args; + int r_fmode; /* file mode, if expecting cap */ + + /* for choosing which mds to send this request to */ + int r_direct_mode; + u32 r_direct_hash; /* choose dir frag based on this dentry hash */ + bool r_direct_is_hash; /* true if r_direct_hash is valid */ + + /* data payload is used for xattr ops */ + struct page **r_pages; + int r_num_pages; + int r_data_len; + + /* what caps shall we drop? */ + int r_inode_drop, r_inode_unless; + int r_dentry_drop, r_dentry_unless; + int r_old_dentry_drop, r_old_dentry_unless; + struct inode *r_old_inode; + int r_old_inode_drop, r_old_inode_unless; + + struct ceph_msg *r_request; /* original request */ + struct ceph_msg *r_reply; + struct ceph_mds_reply_info_parsed r_reply_info; + int r_err; + bool r_aborted; + + unsigned long r_timeout; /* optional. jiffies */ + unsigned long r_started; /* start time to measure timeout against */ + unsigned long r_request_started; /* start time for mds request only, + used to measure lease durations */ + + /* link unsafe requests to parent directory, for fsync */ + struct inode *r_unsafe_dir; + struct list_head r_unsafe_dir_item; + + struct ceph_mds_session *r_session; + + int r_attempts; /* resend attempts */ + int r_num_fwd; /* number of forward attempts */ + int r_num_stale; + int r_resend_mds; /* mds to resend to next, if any*/ + + struct kref r_kref; + struct list_head r_wait; + struct completion r_completion; + struct completion r_safe_completion; + ceph_mds_request_callback_t r_callback; + struct list_head r_unsafe_item; /* per-session unsafe list item */ + bool r_got_unsafe, r_got_safe; + + bool r_did_prepopulate; + u32 r_readdir_offset; + + struct ceph_cap_reservation r_caps_reservation; + int r_num_caps; +}; + +/* + * mds client state + */ +struct ceph_mds_client { + struct ceph_client *client; + struct mutex mutex; /* all nested structures */ + + struct ceph_mdsmap *mdsmap; + struct completion safe_umount_waiters, session_close_waiters; + struct list_head waiting_for_map; + + struct ceph_mds_session **sessions; /* NULL for mds if no session */ + int max_sessions; /* len of s_mds_sessions */ + int stopping; /* true if shutting down */ + + /* + * snap_rwsem will cover cap linkage into snaprealms, and + * realm snap contexts. (later, we can do per-realm snap + * contexts locks..) the empty list contains realms with no + * references (implying they contain no inodes with caps) that + * should be destroyed. + */ + struct rw_semaphore snap_rwsem; + struct rb_root snap_realms; + struct list_head snap_empty; + spinlock_t snap_empty_lock; /* protect snap_empty */ + + u64 last_tid; /* most recent mds request */ + struct rb_root request_tree; /* pending mds requests */ + struct delayed_work delayed_work; /* delayed work */ + unsigned long last_renew_caps; /* last time we renewed our caps */ + struct list_head cap_delay_list; /* caps with delayed release */ + spinlock_t cap_delay_lock; /* protects cap_delay_list */ + struct list_head snap_flush_list; /* cap_snaps ready to flush */ + spinlock_t snap_flush_lock; + + u64 cap_flush_seq; + struct list_head cap_dirty; /* inodes with dirty caps */ + int num_cap_flushing; /* # caps we are flushing */ + spinlock_t cap_dirty_lock; /* protects above items */ + wait_queue_head_t cap_flushing_wq; + +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_file; +#endif + + spinlock_t dentry_lru_lock; + struct list_head dentry_lru; + int num_dentry; +}; + +extern const char *ceph_mds_op_name(int op); + +extern struct ceph_mds_session * +__ceph_lookup_mds_session(struct ceph_mds_client *, int mds); + +static inline struct ceph_mds_session * +ceph_get_mds_session(struct ceph_mds_session *s) +{ + atomic_inc(&s->s_ref); + return s; +} + +extern void ceph_put_mds_session(struct ceph_mds_session *s); + +extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, + struct ceph_msg *msg, int mds); + +extern int ceph_mdsc_init(struct ceph_mds_client *mdsc, + struct ceph_client *client); +extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); +extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc); + +extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); + +extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, + struct inode *inode, + struct dentry *dn, int mask); + +extern struct ceph_mds_request * +ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); +extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req); +extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, + struct inode *dir, + struct ceph_mds_request *req); +static inline void ceph_mdsc_get_request(struct ceph_mds_request *req) +{ + kref_get(&req->r_kref); +} +extern void ceph_mdsc_release_request(struct kref *kref); +static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) +{ + kref_put(&req->r_kref, ceph_mdsc_release_request); +} + +extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); + +extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, + int stop_on_nosnap); + +extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry); +extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, + struct inode *inode, + struct dentry *dentry, char action, + u32 seq); + +extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, + struct ceph_msg *msg); + +#endif diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c new file mode 100644 index 00000000000..c4c498e6dfe --- /dev/null +++ b/fs/ceph/mdsmap.c @@ -0,0 +1,174 @@ +#include "ceph_debug.h" + +#include <linux/bug.h> +#include <linux/err.h> +#include <linux/random.h> +#include <linux/slab.h> +#include <linux/types.h> + +#include "mdsmap.h" +#include "messenger.h" +#include "decode.h" + +#include "super.h" + + +/* + * choose a random mds that is "up" (i.e. has a state > 0), or -1. + */ +int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) +{ + int n = 0; + int i; + char r; + + /* count */ + for (i = 0; i < m->m_max_mds; i++) + if (m->m_info[i].state > 0) + n++; + if (n == 0) + return -1; + + /* pick */ + get_random_bytes(&r, 1); + n = r % n; + i = 0; + for (i = 0; n > 0; i++, n--) + while (m->m_info[i].state <= 0) + i++; + + return i; +} + +/* + * Decode an MDS map + * + * Ignore any fields we don't care about (there are quite a few of + * them). + */ +struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) +{ + struct ceph_mdsmap *m; + const void *start = *p; + int i, j, n; + int err = -EINVAL; + u16 version; + + m = kzalloc(sizeof(*m), GFP_NOFS); + if (m == NULL) + return ERR_PTR(-ENOMEM); + + ceph_decode_16_safe(p, end, version, bad); + + ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); + m->m_epoch = ceph_decode_32(p); + m->m_client_epoch = ceph_decode_32(p); + m->m_last_failure = ceph_decode_32(p); + m->m_root = ceph_decode_32(p); + m->m_session_timeout = ceph_decode_32(p); + m->m_session_autoclose = ceph_decode_32(p); + m->m_max_file_size = ceph_decode_64(p); + m->m_max_mds = ceph_decode_32(p); + + m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS); + if (m->m_info == NULL) + goto badmem; + + /* pick out active nodes from mds_info (state > 0) */ + n = ceph_decode_32(p); + for (i = 0; i < n; i++) { + u64 global_id; + u32 namelen; + s32 mds, inc, state; + u64 state_seq; + u8 infoversion; + struct ceph_entity_addr addr; + u32 num_export_targets; + void *pexport_targets = NULL; + + ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); + global_id = ceph_decode_64(p); + infoversion = ceph_decode_8(p); + *p += sizeof(u64); + namelen = ceph_decode_32(p); /* skip mds name */ + *p += namelen; + + ceph_decode_need(p, end, + 4*sizeof(u32) + sizeof(u64) + + sizeof(addr) + sizeof(struct ceph_timespec), + bad); + mds = ceph_decode_32(p); + inc = ceph_decode_32(p); + state = ceph_decode_32(p); + state_seq = ceph_decode_64(p); + ceph_decode_copy(p, &addr, sizeof(addr)); + ceph_decode_addr(&addr); + *p += sizeof(struct ceph_timespec); + *p += sizeof(u32); + ceph_decode_32_safe(p, end, namelen, bad); + *p += namelen; + if (infoversion >= 2) { + ceph_decode_32_safe(p, end, num_export_targets, bad); + pexport_targets = *p; + *p += num_export_targets * sizeof(u32); + } else { + num_export_targets = 0; + } + + dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", + i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr), + ceph_mds_state_name(state)); + if (mds >= 0 && mds < m->m_max_mds && state > 0) { + m->m_info[mds].global_id = global_id; + m->m_info[mds].state = state; + m->m_info[mds].addr = addr; + m->m_info[mds].num_export_targets = num_export_targets; + if (num_export_targets) { + m->m_info[mds].export_targets = + kcalloc(num_export_targets, sizeof(u32), + GFP_NOFS); + for (j = 0; j < num_export_targets; j++) + m->m_info[mds].export_targets[j] = + ceph_decode_32(&pexport_targets); + } else { + m->m_info[mds].export_targets = NULL; + } + } + } + + /* pg_pools */ + ceph_decode_32_safe(p, end, n, bad); + m->m_num_data_pg_pools = n; + m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS); + if (!m->m_data_pg_pools) + goto badmem; + ceph_decode_need(p, end, sizeof(u32)*(n+1), bad); + for (i = 0; i < n; i++) + m->m_data_pg_pools[i] = ceph_decode_32(p); + m->m_cas_pg_pool = ceph_decode_32(p); + + /* ok, we don't care about the rest. */ + dout("mdsmap_decode success epoch %u\n", m->m_epoch); + return m; + +badmem: + err = -ENOMEM; +bad: + pr_err("corrupt mdsmap\n"); + print_hex_dump(KERN_DEBUG, "mdsmap: ", + DUMP_PREFIX_OFFSET, 16, 1, + start, end - start, true); + ceph_mdsmap_destroy(m); + return ERR_PTR(-EINVAL); +} + +void ceph_mdsmap_destroy(struct ceph_mdsmap *m) +{ + int i; + + for (i = 0; i < m->m_max_mds; i++) + kfree(m->m_info[i].export_targets); + kfree(m->m_info); + kfree(m->m_data_pg_pools); + kfree(m); +} diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h new file mode 100644 index 00000000000..eacc131aa5c --- /dev/null +++ b/fs/ceph/mdsmap.h @@ -0,0 +1,54 @@ +#ifndef _FS_CEPH_MDSMAP_H +#define _FS_CEPH_MDSMAP_H + +#include "types.h" + +/* + * mds map - describe servers in the mds cluster. + * + * we limit fields to those the client actually xcares about + */ +struct ceph_mds_info { + u64 global_id; + struct ceph_entity_addr addr; + s32 state; + int num_export_targets; + u32 *export_targets; +}; + +struct ceph_mdsmap { + u32 m_epoch, m_client_epoch, m_last_failure; + u32 m_root; + u32 m_session_timeout; /* seconds */ + u32 m_session_autoclose; /* seconds */ + u64 m_max_file_size; + u32 m_max_mds; /* size of m_addr, m_state arrays */ + struct ceph_mds_info *m_info; + + /* which object pools file data can be stored in */ + int m_num_data_pg_pools; + u32 *m_data_pg_pools; + u32 m_cas_pg_pool; +}; + +static inline struct ceph_entity_addr * +ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) +{ + if (w >= m->m_max_mds) + return NULL; + return &m->m_info[w].addr; +} + +static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w) +{ + BUG_ON(w < 0); + if (w >= m->m_max_mds) + return CEPH_MDS_STATE_DNE; + return m->m_info[w].state; +} + +extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m); +extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end); +extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m); + +#endif diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c new file mode 100644 index 00000000000..781656a49bf --- /dev/null +++ b/fs/ceph/messenger.c @@ -0,0 +1,2240 @@ +#include "ceph_debug.h" + +#include <linux/crc32c.h> +#include <linux/ctype.h> +#include <linux/highmem.h> +#include <linux/inet.h> +#include <linux/kthread.h> +#include <linux/net.h> +#include <linux/socket.h> +#include <linux/string.h> +#include <net/tcp.h> + +#include "super.h" +#include "messenger.h" +#include "decode.h" +#include "pagelist.h" + +/* + * Ceph uses the messenger to exchange ceph_msg messages with other + * hosts in the system. The messenger provides ordered and reliable + * delivery. We tolerate TCP disconnects by reconnecting (with + * exponential backoff) in the case of a fault (disconnection, bad + * crc, protocol error). Acks allow sent messages to be discarded by + * the sender. + */ + +/* static tag bytes (protocol control messages) */ +static char tag_msg = CEPH_MSGR_TAG_MSG; +static char tag_ack = CEPH_MSGR_TAG_ACK; +static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; + + +static void queue_con(struct ceph_connection *con); +static void con_work(struct work_struct *); +static void ceph_fault(struct ceph_connection *con); + +const char *ceph_name_type_str(int t) +{ + switch (t) { + case CEPH_ENTITY_TYPE_MON: return "mon"; + case CEPH_ENTITY_TYPE_MDS: return "mds"; + case CEPH_ENTITY_TYPE_OSD: return "osd"; + case CEPH_ENTITY_TYPE_CLIENT: return "client"; + case CEPH_ENTITY_TYPE_ADMIN: return "admin"; + default: return "???"; + } +} + +/* + * nicely render a sockaddr as a string. + */ +#define MAX_ADDR_STR 20 +static char addr_str[MAX_ADDR_STR][40]; +static DEFINE_SPINLOCK(addr_str_lock); +static int last_addr_str; + +const char *pr_addr(const struct sockaddr_storage *ss) +{ + int i; + char *s; + struct sockaddr_in *in4 = (void *)ss; + unsigned char *quad = (void *)&in4->sin_addr.s_addr; + struct sockaddr_in6 *in6 = (void *)ss; + + spin_lock(&addr_str_lock); + i = last_addr_str++; + if (last_addr_str == MAX_ADDR_STR) + last_addr_str = 0; + spin_unlock(&addr_str_lock); + s = addr_str[i]; + + switch (ss->ss_family) { + case AF_INET: + sprintf(s, "%u.%u.%u.%u:%u", + (unsigned int)quad[0], + (unsigned int)quad[1], + (unsigned int)quad[2], + (unsigned int)quad[3], + (unsigned int)ntohs(in4->sin_port)); + break; + + case AF_INET6: + sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u", + in6->sin6_addr.s6_addr16[0], + in6->sin6_addr.s6_addr16[1], + in6->sin6_addr.s6_addr16[2], + in6->sin6_addr.s6_addr16[3], + in6->sin6_addr.s6_addr16[4], + in6->sin6_addr.s6_addr16[5], + in6->sin6_addr.s6_addr16[6], + in6->sin6_addr.s6_addr16[7], + (unsigned int)ntohs(in6->sin6_port)); + break; + + default: + sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family); + } + + return s; +} + +static void encode_my_addr(struct ceph_messenger *msgr) +{ + memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr)); + ceph_encode_addr(&msgr->my_enc_addr); +} + +/* + * work queue for all reading and writing to/from the socket. + */ +struct workqueue_struct *ceph_msgr_wq; + +int __init ceph_msgr_init(void) +{ + ceph_msgr_wq = create_workqueue("ceph-msgr"); + if (IS_ERR(ceph_msgr_wq)) { + int ret = PTR_ERR(ceph_msgr_wq); + pr_err("msgr_init failed to create workqueue: %d\n", ret); + ceph_msgr_wq = NULL; + return ret; + } + return 0; +} + +void ceph_msgr_exit(void) +{ + destroy_workqueue(ceph_msgr_wq); +} + +/* + * socket callback functions + */ + +/* data available on socket, or listen socket received a connect */ +static void ceph_data_ready(struct sock *sk, int count_unused) +{ + struct ceph_connection *con = + (struct ceph_connection *)sk->sk_user_data; + if (sk->sk_state != TCP_CLOSE_WAIT) { + dout("ceph_data_ready on %p state = %lu, queueing work\n", + con, con->state); + queue_con(con); + } +} + +/* socket has buffer space for writing */ +static void ceph_write_space(struct sock *sk) +{ + struct ceph_connection *con = + (struct ceph_connection *)sk->sk_user_data; + + /* only queue to workqueue if there is data we want to write. */ + if (test_bit(WRITE_PENDING, &con->state)) { + dout("ceph_write_space %p queueing write work\n", con); + queue_con(con); + } else { + dout("ceph_write_space %p nothing to write\n", con); + } + + /* since we have our own write_space, clear the SOCK_NOSPACE flag */ + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +} + +/* socket's state has changed */ +static void ceph_state_change(struct sock *sk) +{ + struct ceph_connection *con = + (struct ceph_connection *)sk->sk_user_data; + + dout("ceph_state_change %p state = %lu sk_state = %u\n", + con, con->state, sk->sk_state); + + if (test_bit(CLOSED, &con->state)) + return; + + switch (sk->sk_state) { + case TCP_CLOSE: + dout("ceph_state_change TCP_CLOSE\n"); + case TCP_CLOSE_WAIT: + dout("ceph_state_change TCP_CLOSE_WAIT\n"); + if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) { + if (test_bit(CONNECTING, &con->state)) + con->error_msg = "connection failed"; + else + con->error_msg = "socket closed"; + queue_con(con); + } + break; + case TCP_ESTABLISHED: + dout("ceph_state_change TCP_ESTABLISHED\n"); + queue_con(con); + break; + } +} + +/* + * set up socket callbacks + */ +static void set_sock_callbacks(struct socket *sock, + struct ceph_connection *con) +{ + struct sock *sk = sock->sk; + sk->sk_user_data = (void *)con; + sk->sk_data_ready = ceph_data_ready; + sk->sk_write_space = ceph_write_space; + sk->sk_state_change = ceph_state_change; +} + + +/* + * socket helpers + */ + +/* + * initiate connection to a remote socket. + */ +static struct socket *ceph_tcp_connect(struct ceph_connection *con) +{ + struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr; + struct socket *sock; + int ret; + + BUG_ON(con->sock); + ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (ret) + return ERR_PTR(ret); + con->sock = sock; + sock->sk->sk_allocation = GFP_NOFS; + + set_sock_callbacks(sock, con); + + dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); + + ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK); + if (ret == -EINPROGRESS) { + dout("connect %s EINPROGRESS sk_state = %u\n", + pr_addr(&con->peer_addr.in_addr), + sock->sk->sk_state); + ret = 0; + } + if (ret < 0) { + pr_err("connect %s error %d\n", + pr_addr(&con->peer_addr.in_addr), ret); + sock_release(sock); + con->sock = NULL; + con->error_msg = "connect error"; + } + + if (ret < 0) + return ERR_PTR(ret); + return sock; +} + +static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) +{ + struct kvec iov = {buf, len}; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + + return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags); +} + +/* + * write something. @more is true if caller will be sending more data + * shortly. + */ +static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, + size_t kvlen, size_t len, int more) +{ + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + + if (more) + msg.msg_flags |= MSG_MORE; + else + msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ + + return kernel_sendmsg(sock, &msg, iov, kvlen, len); +} + + +/* + * Shutdown/close the socket for the given connection. + */ +static int con_close_socket(struct ceph_connection *con) +{ + int rc; + + dout("con_close_socket on %p sock %p\n", con, con->sock); + if (!con->sock) + return 0; + set_bit(SOCK_CLOSED, &con->state); + rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); + sock_release(con->sock); + con->sock = NULL; + clear_bit(SOCK_CLOSED, &con->state); + return rc; +} + +/* + * Reset a connection. Discard all incoming and outgoing messages + * and clear *_seq state. + */ +static void ceph_msg_remove(struct ceph_msg *msg) +{ + list_del_init(&msg->list_head); + ceph_msg_put(msg); +} +static void ceph_msg_remove_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct ceph_msg *msg = list_first_entry(head, struct ceph_msg, + list_head); + ceph_msg_remove(msg); + } +} + +static void reset_connection(struct ceph_connection *con) +{ + /* reset connection, out_queue, msg_ and connect_seq */ + /* discard existing out_queue and msg_seq */ + ceph_msg_remove_list(&con->out_queue); + ceph_msg_remove_list(&con->out_sent); + + if (con->in_msg) { + ceph_msg_put(con->in_msg); + con->in_msg = NULL; + } + + con->connect_seq = 0; + con->out_seq = 0; + if (con->out_msg) { + ceph_msg_put(con->out_msg); + con->out_msg = NULL; + } + con->in_seq = 0; +} + +/* + * mark a peer down. drop any open connections. + */ +void ceph_con_close(struct ceph_connection *con) +{ + dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr)); + set_bit(CLOSED, &con->state); /* in case there's queued work */ + clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */ + clear_bit(LOSSYTX, &con->state); /* so we retry next connect */ + clear_bit(KEEPALIVE_PENDING, &con->state); + clear_bit(WRITE_PENDING, &con->state); + mutex_lock(&con->mutex); + reset_connection(con); + cancel_delayed_work(&con->work); + mutex_unlock(&con->mutex); + queue_con(con); +} + +/* + * Reopen a closed connection, with a new peer address. + */ +void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr) +{ + dout("con_open %p %s\n", con, pr_addr(&addr->in_addr)); + set_bit(OPENING, &con->state); + clear_bit(CLOSED, &con->state); + memcpy(&con->peer_addr, addr, sizeof(*addr)); + con->delay = 0; /* reset backoff memory */ + queue_con(con); +} + +/* + * generic get/put + */ +struct ceph_connection *ceph_con_get(struct ceph_connection *con) +{ + dout("con_get %p nref = %d -> %d\n", con, + atomic_read(&con->nref), atomic_read(&con->nref) + 1); + if (atomic_inc_not_zero(&con->nref)) + return con; + return NULL; +} + +void ceph_con_put(struct ceph_connection *con) +{ + dout("con_put %p nref = %d -> %d\n", con, + atomic_read(&con->nref), atomic_read(&con->nref) - 1); + BUG_ON(atomic_read(&con->nref) == 0); + if (atomic_dec_and_test(&con->nref)) { + BUG_ON(con->sock); + kfree(con); + } +} + +/* + * initialize a new connection. + */ +void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con) +{ + dout("con_init %p\n", con); + memset(con, 0, sizeof(*con)); + atomic_set(&con->nref, 1); + con->msgr = msgr; + mutex_init(&con->mutex); + INIT_LIST_HEAD(&con->out_queue); + INIT_LIST_HEAD(&con->out_sent); + INIT_DELAYED_WORK(&con->work, con_work); +} + + +/* + * We maintain a global counter to order connection attempts. Get + * a unique seq greater than @gt. + */ +static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) +{ + u32 ret; + + spin_lock(&msgr->global_seq_lock); + if (msgr->global_seq < gt) + msgr->global_seq = gt; + ret = ++msgr->global_seq; + spin_unlock(&msgr->global_seq_lock); + return ret; +} + + +/* + * Prepare footer for currently outgoing message, and finish things + * off. Assumes out_kvec* are already valid.. we just add on to the end. + */ +static void prepare_write_message_footer(struct ceph_connection *con, int v) +{ + struct ceph_msg *m = con->out_msg; + + dout("prepare_write_message_footer %p\n", con); + con->out_kvec_is_msg = true; + con->out_kvec[v].iov_base = &m->footer; + con->out_kvec[v].iov_len = sizeof(m->footer); + con->out_kvec_bytes += sizeof(m->footer); + con->out_kvec_left++; + con->out_more = m->more_to_follow; + con->out_msg_done = true; +} + +/* + * Prepare headers for the next outgoing message. + */ +static void prepare_write_message(struct ceph_connection *con) +{ + struct ceph_msg *m; + int v = 0; + + con->out_kvec_bytes = 0; + con->out_kvec_is_msg = true; + con->out_msg_done = false; + + /* Sneak an ack in there first? If we can get it into the same + * TCP packet that's a good thing. */ + if (con->in_seq > con->in_seq_acked) { + con->in_seq_acked = con->in_seq; + con->out_kvec[v].iov_base = &tag_ack; + con->out_kvec[v++].iov_len = 1; + con->out_temp_ack = cpu_to_le64(con->in_seq_acked); + con->out_kvec[v].iov_base = &con->out_temp_ack; + con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack); + con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack); + } + + m = list_first_entry(&con->out_queue, + struct ceph_msg, list_head); + con->out_msg = m; + if (test_bit(LOSSYTX, &con->state)) { + list_del_init(&m->list_head); + } else { + /* put message on sent list */ + ceph_msg_get(m); + list_move_tail(&m->list_head, &con->out_sent); + } + + m->hdr.seq = cpu_to_le64(++con->out_seq); + + dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n", + m, con->out_seq, le16_to_cpu(m->hdr.type), + le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), + le32_to_cpu(m->hdr.data_len), + m->nr_pages); + BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); + + /* tag + hdr + front + middle */ + con->out_kvec[v].iov_base = &tag_msg; + con->out_kvec[v++].iov_len = 1; + con->out_kvec[v].iov_base = &m->hdr; + con->out_kvec[v++].iov_len = sizeof(m->hdr); + con->out_kvec[v++] = m->front; + if (m->middle) + con->out_kvec[v++] = m->middle->vec; + con->out_kvec_left = v; + con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len + + (m->middle ? m->middle->vec.iov_len : 0); + con->out_kvec_cur = con->out_kvec; + + /* fill in crc (except data pages), footer */ + con->out_msg->hdr.crc = + cpu_to_le32(crc32c(0, (void *)&m->hdr, + sizeof(m->hdr) - sizeof(m->hdr.crc))); + con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE; + con->out_msg->footer.front_crc = + cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len)); + if (m->middle) + con->out_msg->footer.middle_crc = + cpu_to_le32(crc32c(0, m->middle->vec.iov_base, + m->middle->vec.iov_len)); + else + con->out_msg->footer.middle_crc = 0; + con->out_msg->footer.data_crc = 0; + dout("prepare_write_message front_crc %u data_crc %u\n", + le32_to_cpu(con->out_msg->footer.front_crc), + le32_to_cpu(con->out_msg->footer.middle_crc)); + + /* is there a data payload? */ + if (le32_to_cpu(m->hdr.data_len) > 0) { + /* initialize page iterator */ + con->out_msg_pos.page = 0; + con->out_msg_pos.page_pos = + le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK; + con->out_msg_pos.data_pos = 0; + con->out_msg_pos.did_page_crc = 0; + con->out_more = 1; /* data + footer will follow */ + } else { + /* no, queue up footer too and be done */ + prepare_write_message_footer(con, v); + } + + set_bit(WRITE_PENDING, &con->state); +} + +/* + * Prepare an ack. + */ +static void prepare_write_ack(struct ceph_connection *con) +{ + dout("prepare_write_ack %p %llu -> %llu\n", con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + con->out_kvec[0].iov_base = &tag_ack; + con->out_kvec[0].iov_len = 1; + con->out_temp_ack = cpu_to_le64(con->in_seq_acked); + con->out_kvec[1].iov_base = &con->out_temp_ack; + con->out_kvec[1].iov_len = sizeof(con->out_temp_ack); + con->out_kvec_left = 2; + con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack); + con->out_kvec_cur = con->out_kvec; + con->out_more = 1; /* more will follow.. eventually.. */ + set_bit(WRITE_PENDING, &con->state); +} + +/* + * Prepare to write keepalive byte. + */ +static void prepare_write_keepalive(struct ceph_connection *con) +{ + dout("prepare_write_keepalive %p\n", con); + con->out_kvec[0].iov_base = &tag_keepalive; + con->out_kvec[0].iov_len = 1; + con->out_kvec_left = 1; + con->out_kvec_bytes = 1; + con->out_kvec_cur = con->out_kvec; + set_bit(WRITE_PENDING, &con->state); +} + +/* + * Connection negotiation. + */ + +static void prepare_connect_authorizer(struct ceph_connection *con) +{ + void *auth_buf; + int auth_len = 0; + int auth_protocol = 0; + + mutex_unlock(&con->mutex); + if (con->ops->get_authorizer) + con->ops->get_authorizer(con, &auth_buf, &auth_len, + &auth_protocol, &con->auth_reply_buf, + &con->auth_reply_buf_len, + con->auth_retry); + mutex_lock(&con->mutex); + + con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol); + con->out_connect.authorizer_len = cpu_to_le32(auth_len); + + con->out_kvec[con->out_kvec_left].iov_base = auth_buf; + con->out_kvec[con->out_kvec_left].iov_len = auth_len; + con->out_kvec_left++; + con->out_kvec_bytes += auth_len; +} + +/* + * We connected to a peer and are saying hello. + */ +static void prepare_write_banner(struct ceph_messenger *msgr, + struct ceph_connection *con) +{ + int len = strlen(CEPH_BANNER); + + con->out_kvec[0].iov_base = CEPH_BANNER; + con->out_kvec[0].iov_len = len; + con->out_kvec[1].iov_base = &msgr->my_enc_addr; + con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr); + con->out_kvec_left = 2; + con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr); + con->out_kvec_cur = con->out_kvec; + con->out_more = 0; + set_bit(WRITE_PENDING, &con->state); +} + +static void prepare_write_connect(struct ceph_messenger *msgr, + struct ceph_connection *con, + int after_banner) +{ + unsigned global_seq = get_global_seq(con->msgr, 0); + int proto; + + switch (con->peer_name.type) { + case CEPH_ENTITY_TYPE_MON: + proto = CEPH_MONC_PROTOCOL; + break; + case CEPH_ENTITY_TYPE_OSD: + proto = CEPH_OSDC_PROTOCOL; + break; + case CEPH_ENTITY_TYPE_MDS: + proto = CEPH_MDSC_PROTOCOL; + break; + default: + BUG(); + } + + dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, + con->connect_seq, global_seq, proto); + + con->out_connect.features = CEPH_FEATURE_SUPPORTED; + con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); + con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); + con->out_connect.global_seq = cpu_to_le32(global_seq); + con->out_connect.protocol_version = cpu_to_le32(proto); + con->out_connect.flags = 0; + + if (!after_banner) { + con->out_kvec_left = 0; + con->out_kvec_bytes = 0; + } + con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect; + con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect); + con->out_kvec_left++; + con->out_kvec_bytes += sizeof(con->out_connect); + con->out_kvec_cur = con->out_kvec; + con->out_more = 0; + set_bit(WRITE_PENDING, &con->state); + + prepare_connect_authorizer(con); +} + + +/* + * write as much of pending kvecs to the socket as we can. + * 1 -> done + * 0 -> socket full, but more to do + * <0 -> error + */ +static int write_partial_kvec(struct ceph_connection *con) +{ + int ret; + + dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes); + while (con->out_kvec_bytes > 0) { + ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur, + con->out_kvec_left, con->out_kvec_bytes, + con->out_more); + if (ret <= 0) + goto out; + con->out_kvec_bytes -= ret; + if (con->out_kvec_bytes == 0) + break; /* done */ + while (ret > 0) { + if (ret >= con->out_kvec_cur->iov_len) { + ret -= con->out_kvec_cur->iov_len; + con->out_kvec_cur++; + con->out_kvec_left--; + } else { + con->out_kvec_cur->iov_len -= ret; + con->out_kvec_cur->iov_base += ret; + ret = 0; + break; + } + } + } + con->out_kvec_left = 0; + con->out_kvec_is_msg = false; + ret = 1; +out: + dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, + con->out_kvec_bytes, con->out_kvec_left, ret); + return ret; /* done! */ +} + +/* + * Write as much message data payload as we can. If we finish, queue + * up the footer. + * 1 -> done, footer is now queued in out_kvec[]. + * 0 -> socket full, but more to do + * <0 -> error + */ +static int write_partial_msg_pages(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + unsigned data_len = le32_to_cpu(msg->hdr.data_len); + size_t len; + int crc = con->msgr->nocrc; + int ret; + + dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", + con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages, + con->out_msg_pos.page_pos); + + while (con->out_msg_pos.page < con->out_msg->nr_pages) { + struct page *page = NULL; + void *kaddr = NULL; + + /* + * if we are calculating the data crc (the default), we need + * to map the page. if our pages[] has been revoked, use the + * zero page. + */ + if (msg->pages) { + page = msg->pages[con->out_msg_pos.page]; + if (crc) + kaddr = kmap(page); + } else if (msg->pagelist) { + page = list_first_entry(&msg->pagelist->head, + struct page, lru); + if (crc) + kaddr = kmap(page); + } else { + page = con->msgr->zero_page; + if (crc) + kaddr = page_address(con->msgr->zero_page); + } + len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos), + (int)(data_len - con->out_msg_pos.data_pos)); + if (crc && !con->out_msg_pos.did_page_crc) { + void *base = kaddr + con->out_msg_pos.page_pos; + u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); + + BUG_ON(kaddr == NULL); + con->out_msg->footer.data_crc = + cpu_to_le32(crc32c(tmpcrc, base, len)); + con->out_msg_pos.did_page_crc = 1; + } + + ret = kernel_sendpage(con->sock, page, + con->out_msg_pos.page_pos, len, + MSG_DONTWAIT | MSG_NOSIGNAL | + MSG_MORE); + + if (crc && (msg->pages || msg->pagelist)) + kunmap(page); + + if (ret <= 0) + goto out; + + con->out_msg_pos.data_pos += ret; + con->out_msg_pos.page_pos += ret; + if (ret == len) { + con->out_msg_pos.page_pos = 0; + con->out_msg_pos.page++; + con->out_msg_pos.did_page_crc = 0; + if (msg->pagelist) + list_move_tail(&page->lru, + &msg->pagelist->head); + } + } + + dout("write_partial_msg_pages %p msg %p done\n", con, msg); + + /* prepare and queue up footer, too */ + if (!crc) + con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; + con->out_kvec_bytes = 0; + con->out_kvec_left = 0; + con->out_kvec_cur = con->out_kvec; + prepare_write_message_footer(con, 0); + ret = 1; +out: + return ret; +} + +/* + * write some zeros + */ +static int write_partial_skip(struct ceph_connection *con) +{ + int ret; + + while (con->out_skip > 0) { + struct kvec iov = { + .iov_base = page_address(con->msgr->zero_page), + .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE) + }; + + ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1); + if (ret <= 0) + goto out; + con->out_skip -= ret; + } + ret = 1; +out: + return ret; +} + +/* + * Prepare to read connection handshake, or an ack. + */ +static void prepare_read_banner(struct ceph_connection *con) +{ + dout("prepare_read_banner %p\n", con); + con->in_base_pos = 0; +} + +static void prepare_read_connect(struct ceph_connection *con) +{ + dout("prepare_read_connect %p\n", con); + con->in_base_pos = 0; +} + +static void prepare_read_connect_retry(struct ceph_connection *con) +{ + dout("prepare_read_connect_retry %p\n", con); + con->in_base_pos = strlen(CEPH_BANNER) + sizeof(con->actual_peer_addr) + + sizeof(con->peer_addr_for_me); +} + +static void prepare_read_ack(struct ceph_connection *con) +{ + dout("prepare_read_ack %p\n", con); + con->in_base_pos = 0; +} + +static void prepare_read_tag(struct ceph_connection *con) +{ + dout("prepare_read_tag %p\n", con); + con->in_base_pos = 0; + con->in_tag = CEPH_MSGR_TAG_READY; +} + +/* + * Prepare to read a message. + */ +static int prepare_read_message(struct ceph_connection *con) +{ + dout("prepare_read_message %p\n", con); + BUG_ON(con->in_msg != NULL); + con->in_base_pos = 0; + con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0; + return 0; +} + + +static int read_partial(struct ceph_connection *con, + int *to, int size, void *object) +{ + *to += size; + while (con->in_base_pos < *to) { + int left = *to - con->in_base_pos; + int have = size - left; + int ret = ceph_tcp_recvmsg(con->sock, object + have, left); + if (ret <= 0) + return ret; + con->in_base_pos += ret; + } + return 1; +} + + +/* + * Read all or part of the connect-side handshake on a new connection + */ +static int read_partial_banner(struct ceph_connection *con) +{ + int ret, to = 0; + + dout("read_partial_banner %p at %d\n", con, con->in_base_pos); + + /* peer's banner */ + ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner); + if (ret <= 0) + goto out; + ret = read_partial(con, &to, sizeof(con->actual_peer_addr), + &con->actual_peer_addr); + if (ret <= 0) + goto out; + ret = read_partial(con, &to, sizeof(con->peer_addr_for_me), + &con->peer_addr_for_me); + if (ret <= 0) + goto out; +out: + return ret; +} + +static int read_partial_connect(struct ceph_connection *con) +{ + int ret, to = 0; + + dout("read_partial_connect %p at %d\n", con, con->in_base_pos); + + ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply); + if (ret <= 0) + goto out; + ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len), + con->auth_reply_buf); + if (ret <= 0) + goto out; + + dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", + con, (int)con->in_reply.tag, + le32_to_cpu(con->in_reply.connect_seq), + le32_to_cpu(con->in_reply.global_seq)); +out: + return ret; + +} + +/* + * Verify the hello banner looks okay. + */ +static int verify_hello(struct ceph_connection *con) +{ + if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { + pr_err("connect to %s got bad banner\n", + pr_addr(&con->peer_addr.in_addr)); + con->error_msg = "protocol error, bad banner"; + return -1; + } + return 0; +} + +static bool addr_is_blank(struct sockaddr_storage *ss) +{ + switch (ss->ss_family) { + case AF_INET: + return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0; + case AF_INET6: + return + ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 && + ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 && + ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 && + ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0; + } + return false; +} + +static int addr_port(struct sockaddr_storage *ss) +{ + switch (ss->ss_family) { + case AF_INET: + return ntohs(((struct sockaddr_in *)ss)->sin_port); + case AF_INET6: + return ntohs(((struct sockaddr_in6 *)ss)->sin6_port); + } + return 0; +} + +static void addr_set_port(struct sockaddr_storage *ss, int p) +{ + switch (ss->ss_family) { + case AF_INET: + ((struct sockaddr_in *)ss)->sin_port = htons(p); + case AF_INET6: + ((struct sockaddr_in6 *)ss)->sin6_port = htons(p); + } +} + +/* + * Parse an ip[:port] list into an addr array. Use the default + * monitor port if a port isn't specified. + */ +int ceph_parse_ips(const char *c, const char *end, + struct ceph_entity_addr *addr, + int max_count, int *count) +{ + int i; + const char *p = c; + + dout("parse_ips on '%.*s'\n", (int)(end-c), c); + for (i = 0; i < max_count; i++) { + const char *ipend; + struct sockaddr_storage *ss = &addr[i].in_addr; + struct sockaddr_in *in4 = (void *)ss; + struct sockaddr_in6 *in6 = (void *)ss; + int port; + + memset(ss, 0, sizeof(*ss)); + if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr, + ',', &ipend)) { + ss->ss_family = AF_INET; + } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr, + ',', &ipend)) { + ss->ss_family = AF_INET6; + } else { + goto bad; + } + p = ipend; + + /* port? */ + if (p < end && *p == ':') { + port = 0; + p++; + while (p < end && *p >= '0' && *p <= '9') { + port = (port * 10) + (*p - '0'); + p++; + } + if (port > 65535 || port == 0) + goto bad; + } else { + port = CEPH_MON_PORT; + } + + addr_set_port(ss, port); + + dout("parse_ips got %s\n", pr_addr(ss)); + + if (p == end) + break; + if (*p != ',') + goto bad; + p++; + } + + if (p != end) + goto bad; + + if (count) + *count = i + 1; + return 0; + +bad: + pr_err("parse_ips bad ip '%s'\n", c); + return -EINVAL; +} + +static int process_banner(struct ceph_connection *con) +{ + dout("process_banner on %p\n", con); + + if (verify_hello(con) < 0) + return -1; + + ceph_decode_addr(&con->actual_peer_addr); + ceph_decode_addr(&con->peer_addr_for_me); + + /* + * Make sure the other end is who we wanted. note that the other + * end may not yet know their ip address, so if it's 0.0.0.0, give + * them the benefit of the doubt. + */ + if (memcmp(&con->peer_addr, &con->actual_peer_addr, + sizeof(con->peer_addr)) != 0 && + !(addr_is_blank(&con->actual_peer_addr.in_addr) && + con->actual_peer_addr.nonce == con->peer_addr.nonce)) { + pr_warning("wrong peer, want %s/%lld, got %s/%lld\n", + pr_addr(&con->peer_addr.in_addr), + le64_to_cpu(con->peer_addr.nonce), + pr_addr(&con->actual_peer_addr.in_addr), + le64_to_cpu(con->actual_peer_addr.nonce)); + con->error_msg = "wrong peer at address"; + return -1; + } + + /* + * did we learn our address? + */ + if (addr_is_blank(&con->msgr->inst.addr.in_addr)) { + int port = addr_port(&con->msgr->inst.addr.in_addr); + + memcpy(&con->msgr->inst.addr.in_addr, + &con->peer_addr_for_me.in_addr, + sizeof(con->peer_addr_for_me.in_addr)); + addr_set_port(&con->msgr->inst.addr.in_addr, port); + encode_my_addr(con->msgr); + dout("process_banner learned my addr is %s\n", + pr_addr(&con->msgr->inst.addr.in_addr)); + } + + set_bit(NEGOTIATING, &con->state); + prepare_read_connect(con); + return 0; +} + +static void fail_protocol(struct ceph_connection *con) +{ + reset_connection(con); + set_bit(CLOSED, &con->state); /* in case there's queued work */ + + mutex_unlock(&con->mutex); + if (con->ops->bad_proto) + con->ops->bad_proto(con); + mutex_lock(&con->mutex); +} + +static int process_connect(struct ceph_connection *con) +{ + u64 sup_feat = CEPH_FEATURE_SUPPORTED; + u64 req_feat = CEPH_FEATURE_REQUIRED; + u64 server_feat = le64_to_cpu(con->in_reply.features); + + dout("process_connect on %p tag %d\n", con, (int)con->in_tag); + + switch (con->in_reply.tag) { + case CEPH_MSGR_TAG_FEATURES: + pr_err("%s%lld %s feature set mismatch," + " my %llx < server's %llx, missing %llx\n", + ENTITY_NAME(con->peer_name), + pr_addr(&con->peer_addr.in_addr), + sup_feat, server_feat, server_feat & ~sup_feat); + con->error_msg = "missing required protocol features"; + fail_protocol(con); + return -1; + + case CEPH_MSGR_TAG_BADPROTOVER: + pr_err("%s%lld %s protocol version mismatch," + " my %d != server's %d\n", + ENTITY_NAME(con->peer_name), + pr_addr(&con->peer_addr.in_addr), + le32_to_cpu(con->out_connect.protocol_version), + le32_to_cpu(con->in_reply.protocol_version)); + con->error_msg = "protocol version mismatch"; + fail_protocol(con); + return -1; + + case CEPH_MSGR_TAG_BADAUTHORIZER: + con->auth_retry++; + dout("process_connect %p got BADAUTHORIZER attempt %d\n", con, + con->auth_retry); + if (con->auth_retry == 2) { + con->error_msg = "connect authorization failure"; + reset_connection(con); + set_bit(CLOSED, &con->state); + return -1; + } + con->auth_retry = 1; + prepare_write_connect(con->msgr, con, 0); + prepare_read_connect_retry(con); + break; + + case CEPH_MSGR_TAG_RESETSESSION: + /* + * If we connected with a large connect_seq but the peer + * has no record of a session with us (no connection, or + * connect_seq == 0), they will send RESETSESION to indicate + * that they must have reset their session, and may have + * dropped messages. + */ + dout("process_connect got RESET peer seq %u\n", + le32_to_cpu(con->in_connect.connect_seq)); + pr_err("%s%lld %s connection reset\n", + ENTITY_NAME(con->peer_name), + pr_addr(&con->peer_addr.in_addr)); + reset_connection(con); + prepare_write_connect(con->msgr, con, 0); + prepare_read_connect(con); + + /* Tell ceph about it. */ + mutex_unlock(&con->mutex); + pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name)); + if (con->ops->peer_reset) + con->ops->peer_reset(con); + mutex_lock(&con->mutex); + break; + + case CEPH_MSGR_TAG_RETRY_SESSION: + /* + * If we sent a smaller connect_seq than the peer has, try + * again with a larger value. + */ + dout("process_connect got RETRY my seq = %u, peer_seq = %u\n", + le32_to_cpu(con->out_connect.connect_seq), + le32_to_cpu(con->in_connect.connect_seq)); + con->connect_seq = le32_to_cpu(con->in_connect.connect_seq); + prepare_write_connect(con->msgr, con, 0); + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_RETRY_GLOBAL: + /* + * If we sent a smaller global_seq than the peer has, try + * again with a larger value. + */ + dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", + con->peer_global_seq, + le32_to_cpu(con->in_connect.global_seq)); + get_global_seq(con->msgr, + le32_to_cpu(con->in_connect.global_seq)); + prepare_write_connect(con->msgr, con, 0); + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_READY: + if (req_feat & ~server_feat) { + pr_err("%s%lld %s protocol feature mismatch," + " my required %llx > server's %llx, need %llx\n", + ENTITY_NAME(con->peer_name), + pr_addr(&con->peer_addr.in_addr), + req_feat, server_feat, req_feat & ~server_feat); + con->error_msg = "missing required protocol features"; + fail_protocol(con); + return -1; + } + clear_bit(CONNECTING, &con->state); + con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); + con->connect_seq++; + dout("process_connect got READY gseq %d cseq %d (%d)\n", + con->peer_global_seq, + le32_to_cpu(con->in_reply.connect_seq), + con->connect_seq); + WARN_ON(con->connect_seq != + le32_to_cpu(con->in_reply.connect_seq)); + + if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) + set_bit(LOSSYTX, &con->state); + + prepare_read_tag(con); + break; + + case CEPH_MSGR_TAG_WAIT: + /* + * If there is a connection race (we are opening + * connections to each other), one of us may just have + * to WAIT. This shouldn't happen if we are the + * client. + */ + pr_err("process_connect peer connecting WAIT\n"); + + default: + pr_err("connect protocol error, will retry\n"); + con->error_msg = "protocol error, garbage tag during connect"; + return -1; + } + return 0; +} + + +/* + * read (part of) an ack + */ +static int read_partial_ack(struct ceph_connection *con) +{ + int to = 0; + + return read_partial(con, &to, sizeof(con->in_temp_ack), + &con->in_temp_ack); +} + + +/* + * We can finally discard anything that's been acked. + */ +static void process_ack(struct ceph_connection *con) +{ + struct ceph_msg *m; + u64 ack = le64_to_cpu(con->in_temp_ack); + u64 seq; + + while (!list_empty(&con->out_sent)) { + m = list_first_entry(&con->out_sent, struct ceph_msg, + list_head); + seq = le64_to_cpu(m->hdr.seq); + if (seq > ack) + break; + dout("got ack for seq %llu type %d at %p\n", seq, + le16_to_cpu(m->hdr.type), m); + ceph_msg_remove(m); + } + prepare_read_tag(con); +} + + + + +static int read_partial_message_section(struct ceph_connection *con, + struct kvec *section, unsigned int sec_len, + u32 *crc) +{ + int left; + int ret; + + BUG_ON(!section); + + while (section->iov_len < sec_len) { + BUG_ON(section->iov_base == NULL); + left = sec_len - section->iov_len; + ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base + + section->iov_len, left); + if (ret <= 0) + return ret; + section->iov_len += ret; + if (section->iov_len == sec_len) + *crc = crc32c(0, section->iov_base, + section->iov_len); + } + + return 1; +} + +static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip); +/* + * read (part of) a message. + */ +static int read_partial_message(struct ceph_connection *con) +{ + struct ceph_msg *m = con->in_msg; + void *p; + int ret; + int to, left; + unsigned front_len, middle_len, data_len, data_off; + int datacrc = con->msgr->nocrc; + int skip; + + dout("read_partial_message con %p msg %p\n", con, m); + + /* header */ + while (con->in_base_pos < sizeof(con->in_hdr)) { + left = sizeof(con->in_hdr) - con->in_base_pos; + ret = ceph_tcp_recvmsg(con->sock, + (char *)&con->in_hdr + con->in_base_pos, + left); + if (ret <= 0) + return ret; + con->in_base_pos += ret; + if (con->in_base_pos == sizeof(con->in_hdr)) { + u32 crc = crc32c(0, (void *)&con->in_hdr, + sizeof(con->in_hdr) - sizeof(con->in_hdr.crc)); + if (crc != le32_to_cpu(con->in_hdr.crc)) { + pr_err("read_partial_message bad hdr " + " crc %u != expected %u\n", + crc, con->in_hdr.crc); + return -EBADMSG; + } + } + } + front_len = le32_to_cpu(con->in_hdr.front_len); + if (front_len > CEPH_MSG_MAX_FRONT_LEN) + return -EIO; + middle_len = le32_to_cpu(con->in_hdr.middle_len); + if (middle_len > CEPH_MSG_MAX_DATA_LEN) + return -EIO; + data_len = le32_to_cpu(con->in_hdr.data_len); + if (data_len > CEPH_MSG_MAX_DATA_LEN) + return -EIO; + data_off = le16_to_cpu(con->in_hdr.data_off); + + /* allocate message? */ + if (!con->in_msg) { + dout("got hdr type %d front %d data %d\n", con->in_hdr.type, + con->in_hdr.front_len, con->in_hdr.data_len); + con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); + if (skip) { + /* skip this message */ + dout("alloc_msg returned NULL, skipping message\n"); + con->in_base_pos = -front_len - middle_len - data_len - + sizeof(m->footer); + con->in_tag = CEPH_MSGR_TAG_READY; + return 0; + } + if (IS_ERR(con->in_msg)) { + ret = PTR_ERR(con->in_msg); + con->in_msg = NULL; + con->error_msg = + "error allocating memory for incoming message"; + return ret; + } + m = con->in_msg; + m->front.iov_len = 0; /* haven't read it yet */ + if (m->middle) + m->middle->vec.iov_len = 0; + + con->in_msg_pos.page = 0; + con->in_msg_pos.page_pos = data_off & ~PAGE_MASK; + con->in_msg_pos.data_pos = 0; + } + + /* front */ + ret = read_partial_message_section(con, &m->front, front_len, + &con->in_front_crc); + if (ret <= 0) + return ret; + + /* middle */ + if (m->middle) { + ret = read_partial_message_section(con, &m->middle->vec, middle_len, + &con->in_middle_crc); + if (ret <= 0) + return ret; + } + + /* (page) data */ + while (con->in_msg_pos.data_pos < data_len) { + left = min((int)(data_len - con->in_msg_pos.data_pos), + (int)(PAGE_SIZE - con->in_msg_pos.page_pos)); + BUG_ON(m->pages == NULL); + p = kmap(m->pages[con->in_msg_pos.page]); + ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, + left); + if (ret > 0 && datacrc) + con->in_data_crc = + crc32c(con->in_data_crc, + p + con->in_msg_pos.page_pos, ret); + kunmap(m->pages[con->in_msg_pos.page]); + if (ret <= 0) + return ret; + con->in_msg_pos.data_pos += ret; + con->in_msg_pos.page_pos += ret; + if (con->in_msg_pos.page_pos == PAGE_SIZE) { + con->in_msg_pos.page_pos = 0; + con->in_msg_pos.page++; + } + } + + /* footer */ + to = sizeof(m->hdr) + sizeof(m->footer); + while (con->in_base_pos < to) { + left = to - con->in_base_pos; + ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer + + (con->in_base_pos - sizeof(m->hdr)), + left); + if (ret <= 0) + return ret; + con->in_base_pos += ret; + } + dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", + m, front_len, m->footer.front_crc, middle_len, + m->footer.middle_crc, data_len, m->footer.data_crc); + + /* crc ok? */ + if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) { + pr_err("read_partial_message %p front crc %u != exp. %u\n", + m, con->in_front_crc, m->footer.front_crc); + return -EBADMSG; + } + if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) { + pr_err("read_partial_message %p middle crc %u != exp %u\n", + m, con->in_middle_crc, m->footer.middle_crc); + return -EBADMSG; + } + if (datacrc && + (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && + con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { + pr_err("read_partial_message %p data crc %u != exp. %u\n", m, + con->in_data_crc, le32_to_cpu(m->footer.data_crc)); + return -EBADMSG; + } + + return 1; /* done! */ +} + +/* + * Process message. This happens in the worker thread. The callback should + * be careful not to do anything that waits on other incoming messages or it + * may deadlock. + */ +static void process_message(struct ceph_connection *con) +{ + struct ceph_msg *msg; + + msg = con->in_msg; + con->in_msg = NULL; + + /* if first message, set peer_name */ + if (con->peer_name.type == 0) + con->peer_name = msg->hdr.src.name; + + con->in_seq++; + mutex_unlock(&con->mutex); + + dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", + msg, le64_to_cpu(msg->hdr.seq), + ENTITY_NAME(msg->hdr.src.name), + le16_to_cpu(msg->hdr.type), + ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), + le32_to_cpu(msg->hdr.front_len), + le32_to_cpu(msg->hdr.data_len), + con->in_front_crc, con->in_middle_crc, con->in_data_crc); + con->ops->dispatch(con, msg); + + mutex_lock(&con->mutex); + prepare_read_tag(con); +} + + +/* + * Write something to the socket. Called in a worker thread when the + * socket appears to be writeable and we have something ready to send. + */ +static int try_write(struct ceph_connection *con) +{ + struct ceph_messenger *msgr = con->msgr; + int ret = 1; + + dout("try_write start %p state %lu nref %d\n", con, con->state, + atomic_read(&con->nref)); + + mutex_lock(&con->mutex); +more: + dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); + + /* open the socket first? */ + if (con->sock == NULL) { + /* + * if we were STANDBY and are reconnecting _this_ + * connection, bump connect_seq now. Always bump + * global_seq. + */ + if (test_and_clear_bit(STANDBY, &con->state)) + con->connect_seq++; + + prepare_write_banner(msgr, con); + prepare_write_connect(msgr, con, 1); + prepare_read_banner(con); + set_bit(CONNECTING, &con->state); + clear_bit(NEGOTIATING, &con->state); + + BUG_ON(con->in_msg); + con->in_tag = CEPH_MSGR_TAG_READY; + dout("try_write initiating connect on %p new state %lu\n", + con, con->state); + con->sock = ceph_tcp_connect(con); + if (IS_ERR(con->sock)) { + con->sock = NULL; + con->error_msg = "connect error"; + ret = -1; + goto out; + } + } + +more_kvec: + /* kvec data queued? */ + if (con->out_skip) { + ret = write_partial_skip(con); + if (ret <= 0) + goto done; + if (ret < 0) { + dout("try_write write_partial_skip err %d\n", ret); + goto done; + } + } + if (con->out_kvec_left) { + ret = write_partial_kvec(con); + if (ret <= 0) + goto done; + } + + /* msg pages? */ + if (con->out_msg) { + if (con->out_msg_done) { + ceph_msg_put(con->out_msg); + con->out_msg = NULL; /* we're done with this one */ + goto do_next; + } + + ret = write_partial_msg_pages(con); + if (ret == 1) + goto more_kvec; /* we need to send the footer, too! */ + if (ret == 0) + goto done; + if (ret < 0) { + dout("try_write write_partial_msg_pages err %d\n", + ret); + goto done; + } + } + +do_next: + if (!test_bit(CONNECTING, &con->state)) { + /* is anything else pending? */ + if (!list_empty(&con->out_queue)) { + prepare_write_message(con); + goto more; + } + if (con->in_seq > con->in_seq_acked) { + prepare_write_ack(con); + goto more; + } + if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) { + prepare_write_keepalive(con); + goto more; + } + } + + /* Nothing to do! */ + clear_bit(WRITE_PENDING, &con->state); + dout("try_write nothing else to write.\n"); +done: + ret = 0; +out: + mutex_unlock(&con->mutex); + dout("try_write done on %p\n", con); + return ret; +} + + + +/* + * Read what we can from the socket. + */ +static int try_read(struct ceph_connection *con) +{ + struct ceph_messenger *msgr; + int ret = -1; + + if (!con->sock) + return 0; + + if (test_bit(STANDBY, &con->state)) + return 0; + + dout("try_read start on %p\n", con); + msgr = con->msgr; + + mutex_lock(&con->mutex); + +more: + dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, + con->in_base_pos); + if (test_bit(CONNECTING, &con->state)) { + if (!test_bit(NEGOTIATING, &con->state)) { + dout("try_read connecting\n"); + ret = read_partial_banner(con); + if (ret <= 0) + goto done; + if (process_banner(con) < 0) { + ret = -1; + goto out; + } + } + ret = read_partial_connect(con); + if (ret <= 0) + goto done; + if (process_connect(con) < 0) { + ret = -1; + goto out; + } + goto more; + } + + if (con->in_base_pos < 0) { + /* + * skipping + discarding content. + * + * FIXME: there must be a better way to do this! + */ + static char buf[1024]; + int skip = min(1024, -con->in_base_pos); + dout("skipping %d / %d bytes\n", skip, -con->in_base_pos); + ret = ceph_tcp_recvmsg(con->sock, buf, skip); + if (ret <= 0) + goto done; + con->in_base_pos += ret; + if (con->in_base_pos) + goto more; + } + if (con->in_tag == CEPH_MSGR_TAG_READY) { + /* + * what's next? + */ + ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1); + if (ret <= 0) + goto done; + dout("try_read got tag %d\n", (int)con->in_tag); + switch (con->in_tag) { + case CEPH_MSGR_TAG_MSG: + prepare_read_message(con); + break; + case CEPH_MSGR_TAG_ACK: + prepare_read_ack(con); + break; + case CEPH_MSGR_TAG_CLOSE: + set_bit(CLOSED, &con->state); /* fixme */ + goto done; + default: + goto bad_tag; + } + } + if (con->in_tag == CEPH_MSGR_TAG_MSG) { + ret = read_partial_message(con); + if (ret <= 0) { + switch (ret) { + case -EBADMSG: + con->error_msg = "bad crc"; + ret = -EIO; + goto out; + case -EIO: + con->error_msg = "io error"; + goto out; + default: + goto done; + } + } + if (con->in_tag == CEPH_MSGR_TAG_READY) + goto more; + process_message(con); + goto more; + } + if (con->in_tag == CEPH_MSGR_TAG_ACK) { + ret = read_partial_ack(con); + if (ret <= 0) + goto done; + process_ack(con); + goto more; + } + +done: + ret = 0; +out: + mutex_unlock(&con->mutex); + dout("try_read done on %p\n", con); + return ret; + +bad_tag: + pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag); + con->error_msg = "protocol error, garbage tag"; + ret = -1; + goto out; +} + + +/* + * Atomically queue work on a connection. Bump @con reference to + * avoid races with connection teardown. + * + * There is some trickery going on with QUEUED and BUSY because we + * only want a _single_ thread operating on each connection at any + * point in time, but we want to use all available CPUs. + * + * The worker thread only proceeds if it can atomically set BUSY. It + * clears QUEUED and does it's thing. When it thinks it's done, it + * clears BUSY, then rechecks QUEUED.. if it's set again, it loops + * (tries again to set BUSY). + * + * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we + * try to queue work. If that fails (work is already queued, or BUSY) + * we give up (work also already being done or is queued) but leave QUEUED + * set so that the worker thread will loop if necessary. + */ +static void queue_con(struct ceph_connection *con) +{ + if (test_bit(DEAD, &con->state)) { + dout("queue_con %p ignoring: DEAD\n", + con); + return; + } + + if (!con->ops->get(con)) { + dout("queue_con %p ref count 0\n", con); + return; + } + + set_bit(QUEUED, &con->state); + if (test_bit(BUSY, &con->state)) { + dout("queue_con %p - already BUSY\n", con); + con->ops->put(con); + } else if (!queue_work(ceph_msgr_wq, &con->work.work)) { + dout("queue_con %p - already queued\n", con); + con->ops->put(con); + } else { + dout("queue_con %p\n", con); + } +} + +/* + * Do some work on a connection. Drop a connection ref when we're done. + */ +static void con_work(struct work_struct *work) +{ + struct ceph_connection *con = container_of(work, struct ceph_connection, + work.work); + int backoff = 0; + +more: + if (test_and_set_bit(BUSY, &con->state) != 0) { + dout("con_work %p BUSY already set\n", con); + goto out; + } + dout("con_work %p start, clearing QUEUED\n", con); + clear_bit(QUEUED, &con->state); + + if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ + dout("con_work CLOSED\n"); + con_close_socket(con); + goto done; + } + if (test_and_clear_bit(OPENING, &con->state)) { + /* reopen w/ new peer */ + dout("con_work OPENING\n"); + con_close_socket(con); + } + + if (test_and_clear_bit(SOCK_CLOSED, &con->state) || + try_read(con) < 0 || + try_write(con) < 0) { + backoff = 1; + ceph_fault(con); /* error/fault path */ + } + +done: + clear_bit(BUSY, &con->state); + dout("con->state=%lu\n", con->state); + if (test_bit(QUEUED, &con->state)) { + if (!backoff || test_bit(OPENING, &con->state)) { + dout("con_work %p QUEUED reset, looping\n", con); + goto more; + } + dout("con_work %p QUEUED reset, but just faulted\n", con); + clear_bit(QUEUED, &con->state); + } + dout("con_work %p done\n", con); + +out: + con->ops->put(con); +} + + +/* + * Generic error/fault handler. A retry mechanism is used with + * exponential backoff + */ +static void ceph_fault(struct ceph_connection *con) +{ + pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), + pr_addr(&con->peer_addr.in_addr), con->error_msg); + dout("fault %p state %lu to peer %s\n", + con, con->state, pr_addr(&con->peer_addr.in_addr)); + + if (test_bit(LOSSYTX, &con->state)) { + dout("fault on LOSSYTX channel\n"); + goto out; + } + + clear_bit(BUSY, &con->state); /* to avoid an improbable race */ + + mutex_lock(&con->mutex); + if (test_bit(CLOSED, &con->state)) + goto out_unlock; + + con_close_socket(con); + + if (con->in_msg) { + ceph_msg_put(con->in_msg); + con->in_msg = NULL; + } + + /* Requeue anything that hasn't been acked */ + list_splice_init(&con->out_sent, &con->out_queue); + + /* If there are no messages in the queue, place the connection + * in a STANDBY state (i.e., don't try to reconnect just yet). */ + if (list_empty(&con->out_queue) && !con->out_keepalive_pending) { + dout("fault setting STANDBY\n"); + set_bit(STANDBY, &con->state); + } else { + /* retry after a delay. */ + if (con->delay == 0) + con->delay = BASE_DELAY_INTERVAL; + else if (con->delay < MAX_DELAY_INTERVAL) + con->delay *= 2; + dout("fault queueing %p delay %lu\n", con, con->delay); + con->ops->get(con); + if (queue_delayed_work(ceph_msgr_wq, &con->work, + round_jiffies_relative(con->delay)) == 0) + con->ops->put(con); + } + +out_unlock: + mutex_unlock(&con->mutex); +out: + /* + * in case we faulted due to authentication, invalidate our + * current tickets so that we can get new ones. + */ + if (con->auth_retry && con->ops->invalidate_authorizer) { + dout("calling invalidate_authorizer()\n"); + con->ops->invalidate_authorizer(con); + } + + if (con->ops->fault) + con->ops->fault(con); +} + + + +/* + * create a new messenger instance + */ +struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr) +{ + struct ceph_messenger *msgr; + + msgr = kzalloc(sizeof(*msgr), GFP_KERNEL); + if (msgr == NULL) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&msgr->global_seq_lock); + + /* the zero page is needed if a request is "canceled" while the message + * is being written over the socket */ + msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!msgr->zero_page) { + kfree(msgr); + return ERR_PTR(-ENOMEM); + } + kmap(msgr->zero_page); + + if (myaddr) + msgr->inst.addr = *myaddr; + + /* select a random nonce */ + msgr->inst.addr.type = 0; + get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); + encode_my_addr(msgr); + + dout("messenger_create %p\n", msgr); + return msgr; +} + +void ceph_messenger_destroy(struct ceph_messenger *msgr) +{ + dout("destroy %p\n", msgr); + kunmap(msgr->zero_page); + __free_page(msgr->zero_page); + kfree(msgr); + dout("destroyed messenger %p\n", msgr); +} + +/* + * Queue up an outgoing message on the given connection. + */ +void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) +{ + if (test_bit(CLOSED, &con->state)) { + dout("con_send %p closed, dropping %p\n", con, msg); + ceph_msg_put(msg); + return; + } + + /* set src+dst */ + msg->hdr.src.name = con->msgr->inst.name; + msg->hdr.src.addr = con->msgr->my_enc_addr; + msg->hdr.orig_src = msg->hdr.src; + + BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); + + /* queue */ + mutex_lock(&con->mutex); + BUG_ON(!list_empty(&msg->list_head)); + list_add_tail(&msg->list_head, &con->out_queue); + dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg, + ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type), + ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), + le32_to_cpu(msg->hdr.front_len), + le32_to_cpu(msg->hdr.middle_len), + le32_to_cpu(msg->hdr.data_len)); + mutex_unlock(&con->mutex); + + /* if there wasn't anything waiting to send before, queue + * new work */ + if (test_and_set_bit(WRITE_PENDING, &con->state) == 0) + queue_con(con); +} + +/* + * Revoke a message that was previously queued for send + */ +void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) +{ + mutex_lock(&con->mutex); + if (!list_empty(&msg->list_head)) { + dout("con_revoke %p msg %p\n", con, msg); + list_del_init(&msg->list_head); + ceph_msg_put(msg); + msg->hdr.seq = 0; + if (con->out_msg == msg) { + ceph_msg_put(con->out_msg); + con->out_msg = NULL; + } + if (con->out_kvec_is_msg) { + con->out_skip = con->out_kvec_bytes; + con->out_kvec_is_msg = false; + } + } else { + dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg); + } + mutex_unlock(&con->mutex); +} + +/* + * Revoke a message that we may be reading data into + */ +void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) +{ + mutex_lock(&con->mutex); + if (con->in_msg && con->in_msg == msg) { + unsigned front_len = le32_to_cpu(con->in_hdr.front_len); + unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len); + unsigned data_len = le32_to_cpu(con->in_hdr.data_len); + + /* skip rest of message */ + dout("con_revoke_pages %p msg %p revoked\n", con, msg); + con->in_base_pos = con->in_base_pos - + sizeof(struct ceph_msg_header) - + front_len - + middle_len - + data_len - + sizeof(struct ceph_msg_footer); + ceph_msg_put(con->in_msg); + con->in_msg = NULL; + con->in_tag = CEPH_MSGR_TAG_READY; + } else { + dout("con_revoke_pages %p msg %p pages %p no-op\n", + con, con->in_msg, msg); + } + mutex_unlock(&con->mutex); +} + +/* + * Queue a keepalive byte to ensure the tcp connection is alive. + */ +void ceph_con_keepalive(struct ceph_connection *con) +{ + if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 && + test_and_set_bit(WRITE_PENDING, &con->state) == 0) + queue_con(con); +} + + +/* + * construct a new message with given type, size + * the new msg has a ref count of 1. + */ +struct ceph_msg *ceph_msg_new(int type, int front_len, + int page_len, int page_off, struct page **pages) +{ + struct ceph_msg *m; + + m = kmalloc(sizeof(*m), GFP_NOFS); + if (m == NULL) + goto out; + kref_init(&m->kref); + INIT_LIST_HEAD(&m->list_head); + + m->hdr.type = cpu_to_le16(type); + m->hdr.front_len = cpu_to_le32(front_len); + m->hdr.middle_len = 0; + m->hdr.data_len = cpu_to_le32(page_len); + m->hdr.data_off = cpu_to_le16(page_off); + m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); + m->footer.front_crc = 0; + m->footer.middle_crc = 0; + m->footer.data_crc = 0; + m->front_max = front_len; + m->front_is_vmalloc = false; + m->more_to_follow = false; + m->pool = NULL; + + /* front */ + if (front_len) { + if (front_len > PAGE_CACHE_SIZE) { + m->front.iov_base = __vmalloc(front_len, GFP_NOFS, + PAGE_KERNEL); + m->front_is_vmalloc = true; + } else { + m->front.iov_base = kmalloc(front_len, GFP_NOFS); + } + if (m->front.iov_base == NULL) { + pr_err("msg_new can't allocate %d bytes\n", + front_len); + goto out2; + } + } else { + m->front.iov_base = NULL; + } + m->front.iov_len = front_len; + + /* middle */ + m->middle = NULL; + + /* data */ + m->nr_pages = calc_pages_for(page_off, page_len); + m->pages = pages; + m->pagelist = NULL; + + dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len, + m->nr_pages); + return m; + +out2: + ceph_msg_put(m); +out: + pr_err("msg_new can't create type %d len %d\n", type, front_len); + return ERR_PTR(-ENOMEM); +} + +/* + * Allocate "middle" portion of a message, if it is needed and wasn't + * allocated by alloc_msg. This allows us to read a small fixed-size + * per-type header in the front and then gracefully fail (i.e., + * propagate the error to the caller based on info in the front) when + * the middle is too large. + */ +static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) +{ + int type = le16_to_cpu(msg->hdr.type); + int middle_len = le32_to_cpu(msg->hdr.middle_len); + + dout("alloc_middle %p type %d %s middle_len %d\n", msg, type, + ceph_msg_type_name(type), middle_len); + BUG_ON(!middle_len); + BUG_ON(msg->middle); + + msg->middle = ceph_buffer_new(middle_len, GFP_NOFS); + if (!msg->middle) + return -ENOMEM; + return 0; +} + +/* + * Generic message allocator, for incoming messages. + */ +static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip) +{ + int type = le16_to_cpu(hdr->type); + int front_len = le32_to_cpu(hdr->front_len); + int middle_len = le32_to_cpu(hdr->middle_len); + struct ceph_msg *msg = NULL; + int ret; + + if (con->ops->alloc_msg) { + mutex_unlock(&con->mutex); + msg = con->ops->alloc_msg(con, hdr, skip); + mutex_lock(&con->mutex); + if (IS_ERR(msg)) + return msg; + + if (*skip) + return NULL; + } + if (!msg) { + *skip = 0; + msg = ceph_msg_new(type, front_len, 0, 0, NULL); + if (!msg) { + pr_err("unable to allocate msg type %d len %d\n", + type, front_len); + return ERR_PTR(-ENOMEM); + } + } + memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); + + if (middle_len) { + ret = ceph_alloc_middle(con, msg); + + if (ret < 0) { + ceph_msg_put(msg); + return msg; + } + } + + return msg; +} + + +/* + * Free a generically kmalloc'd message. + */ +void ceph_msg_kfree(struct ceph_msg *m) +{ + dout("msg_kfree %p\n", m); + if (m->front_is_vmalloc) + vfree(m->front.iov_base); + else + kfree(m->front.iov_base); + kfree(m); +} + +/* + * Drop a msg ref. Destroy as needed. + */ +void ceph_msg_last_put(struct kref *kref) +{ + struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); + + dout("ceph_msg_put last one on %p\n", m); + WARN_ON(!list_empty(&m->list_head)); + + /* drop middle, data, if any */ + if (m->middle) { + ceph_buffer_put(m->middle); + m->middle = NULL; + } + m->nr_pages = 0; + m->pages = NULL; + + if (m->pagelist) { + ceph_pagelist_release(m->pagelist); + kfree(m->pagelist); + m->pagelist = NULL; + } + + if (m->pool) + ceph_msgpool_put(m->pool, m); + else + ceph_msg_kfree(m); +} + +void ceph_msg_dump(struct ceph_msg *msg) +{ + pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg, + msg->front_max, msg->nr_pages); + print_hex_dump(KERN_DEBUG, "header: ", + DUMP_PREFIX_OFFSET, 16, 1, + &msg->hdr, sizeof(msg->hdr), true); + print_hex_dump(KERN_DEBUG, " front: ", + DUMP_PREFIX_OFFSET, 16, 1, + msg->front.iov_base, msg->front.iov_len, true); + if (msg->middle) + print_hex_dump(KERN_DEBUG, "middle: ", + DUMP_PREFIX_OFFSET, 16, 1, + msg->middle->vec.iov_base, + msg->middle->vec.iov_len, true); + print_hex_dump(KERN_DEBUG, "footer: ", + DUMP_PREFIX_OFFSET, 16, 1, + &msg->footer, sizeof(msg->footer), true); +} diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h new file mode 100644 index 00000000000..4caaa591111 --- /dev/null +++ b/fs/ceph/messenger.h @@ -0,0 +1,254 @@ +#ifndef __FS_CEPH_MESSENGER_H +#define __FS_CEPH_MESSENGER_H + +#include <linux/kref.h> +#include <linux/mutex.h> +#include <linux/net.h> +#include <linux/radix-tree.h> +#include <linux/uio.h> +#include <linux/version.h> +#include <linux/workqueue.h> + +#include "types.h" +#include "buffer.h" + +struct ceph_msg; +struct ceph_connection; + +extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */ + +/* + * Ceph defines these callbacks for handling connection events. + */ +struct ceph_connection_operations { + struct ceph_connection *(*get)(struct ceph_connection *); + void (*put)(struct ceph_connection *); + + /* handle an incoming message. */ + void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m); + + /* authorize an outgoing connection */ + int (*get_authorizer) (struct ceph_connection *con, + void **buf, int *len, int *proto, + void **reply_buf, int *reply_len, int force_new); + int (*verify_authorizer_reply) (struct ceph_connection *con, int len); + int (*invalidate_authorizer)(struct ceph_connection *con); + + /* protocol version mismatch */ + void (*bad_proto) (struct ceph_connection *con); + + /* there was some error on the socket (disconnect, whatever) */ + void (*fault) (struct ceph_connection *con); + + /* a remote host as terminated a message exchange session, and messages + * we sent (or they tried to send us) may be lost. */ + void (*peer_reset) (struct ceph_connection *con); + + struct ceph_msg * (*alloc_msg) (struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip); +}; + +extern const char *ceph_name_type_str(int t); + +/* use format string %s%d */ +#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num) + +struct ceph_messenger { + struct ceph_entity_inst inst; /* my name+address */ + struct ceph_entity_addr my_enc_addr; + struct page *zero_page; /* used in certain error cases */ + + bool nocrc; + + /* + * the global_seq counts connections i (attempt to) initiate + * in order to disambiguate certain connect race conditions. + */ + u32 global_seq; + spinlock_t global_seq_lock; +}; + +/* + * a single message. it contains a header (src, dest, message type, etc.), + * footer (crc values, mainly), a "front" message body, and possibly a + * data payload (stored in some number of pages). + */ +struct ceph_msg { + struct ceph_msg_header hdr; /* header */ + struct ceph_msg_footer footer; /* footer */ + struct kvec front; /* unaligned blobs of message */ + struct ceph_buffer *middle; + struct page **pages; /* data payload. NOT OWNER. */ + unsigned nr_pages; /* size of page array */ + struct ceph_pagelist *pagelist; /* instead of pages */ + struct list_head list_head; + struct kref kref; + bool front_is_vmalloc; + bool more_to_follow; + int front_max; + + struct ceph_msgpool *pool; +}; + +struct ceph_msg_pos { + int page, page_pos; /* which page; offset in page */ + int data_pos; /* offset in data payload */ + int did_page_crc; /* true if we've calculated crc for current page */ +}; + +/* ceph connection fault delay defaults, for exponential backoff */ +#define BASE_DELAY_INTERVAL (HZ/2) +#define MAX_DELAY_INTERVAL (5 * 60 * HZ) + +/* + * ceph_connection state bit flags + * + * QUEUED and BUSY are used together to ensure that only a single + * thread is currently opening, reading or writing data to the socket. + */ +#define LOSSYTX 0 /* we can close channel or drop messages on errors */ +#define CONNECTING 1 +#define NEGOTIATING 2 +#define KEEPALIVE_PENDING 3 +#define WRITE_PENDING 4 /* we have data ready to send */ +#define QUEUED 5 /* there is work queued on this connection */ +#define BUSY 6 /* work is being done */ +#define STANDBY 8 /* no outgoing messages, socket closed. we keep + * the ceph_connection around to maintain shared + * state with the peer. */ +#define CLOSED 10 /* we've closed the connection */ +#define SOCK_CLOSED 11 /* socket state changed to closed */ +#define OPENING 13 /* open connection w/ (possibly new) peer */ +#define DEAD 14 /* dead, about to kfree */ + +/* + * A single connection with another host. + * + * We maintain a queue of outgoing messages, and some session state to + * ensure that we can preserve the lossless, ordered delivery of + * messages in the case of a TCP disconnect. + */ +struct ceph_connection { + void *private; + atomic_t nref; + + const struct ceph_connection_operations *ops; + + struct ceph_messenger *msgr; + struct socket *sock; + unsigned long state; /* connection state (see flags above) */ + const char *error_msg; /* error message, if any */ + + struct ceph_entity_addr peer_addr; /* peer address */ + struct ceph_entity_name peer_name; /* peer name */ + struct ceph_entity_addr peer_addr_for_me; + u32 connect_seq; /* identify the most recent connection + attempt for this connection, client */ + u32 peer_global_seq; /* peer's global seq for this connection */ + + int auth_retry; /* true if we need a newer authorizer */ + void *auth_reply_buf; /* where to put the authorizer reply */ + int auth_reply_buf_len; + + struct mutex mutex; + + /* out queue */ + struct list_head out_queue; + struct list_head out_sent; /* sending or sent but unacked */ + u64 out_seq; /* last message queued for send */ + u64 out_seq_sent; /* last message sent */ + bool out_keepalive_pending; + + u64 in_seq, in_seq_acked; /* last message received, acked */ + + /* connection negotiation temps */ + char in_banner[CEPH_BANNER_MAX_LEN]; + union { + struct { /* outgoing connection */ + struct ceph_msg_connect out_connect; + struct ceph_msg_connect_reply in_reply; + }; + struct { /* incoming */ + struct ceph_msg_connect in_connect; + struct ceph_msg_connect_reply out_reply; + }; + }; + struct ceph_entity_addr actual_peer_addr; + + /* message out temps */ + struct ceph_msg *out_msg; /* sending message (== tail of + out_sent) */ + bool out_msg_done; + struct ceph_msg_pos out_msg_pos; + + struct kvec out_kvec[8], /* sending header/footer data */ + *out_kvec_cur; + int out_kvec_left; /* kvec's left in out_kvec */ + int out_skip; /* skip this many bytes */ + int out_kvec_bytes; /* total bytes left */ + bool out_kvec_is_msg; /* kvec refers to out_msg */ + int out_more; /* there is more data after the kvecs */ + __le64 out_temp_ack; /* for writing an ack */ + + /* message in temps */ + struct ceph_msg_header in_hdr; + struct ceph_msg *in_msg; + struct ceph_msg_pos in_msg_pos; + u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */ + + char in_tag; /* protocol control byte */ + int in_base_pos; /* bytes read */ + __le64 in_temp_ack; /* for reading an ack */ + + struct delayed_work work; /* send|recv work */ + unsigned long delay; /* current delay interval */ +}; + + +extern const char *pr_addr(const struct sockaddr_storage *ss); +extern int ceph_parse_ips(const char *c, const char *end, + struct ceph_entity_addr *addr, + int max_count, int *count); + + +extern int ceph_msgr_init(void); +extern void ceph_msgr_exit(void); + +extern struct ceph_messenger *ceph_messenger_create( + struct ceph_entity_addr *myaddr); +extern void ceph_messenger_destroy(struct ceph_messenger *); + +extern void ceph_con_init(struct ceph_messenger *msgr, + struct ceph_connection *con); +extern void ceph_con_open(struct ceph_connection *con, + struct ceph_entity_addr *addr); +extern void ceph_con_close(struct ceph_connection *con); +extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg); +extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg); +extern void ceph_con_revoke_message(struct ceph_connection *con, + struct ceph_msg *msg); +extern void ceph_con_keepalive(struct ceph_connection *con); +extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); +extern void ceph_con_put(struct ceph_connection *con); + +extern struct ceph_msg *ceph_msg_new(int type, int front_len, + int page_len, int page_off, + struct page **pages); +extern void ceph_msg_kfree(struct ceph_msg *m); + + +static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg) +{ + kref_get(&msg->kref); + return msg; +} +extern void ceph_msg_last_put(struct kref *kref); +static inline void ceph_msg_put(struct ceph_msg *msg) +{ + kref_put(&msg->kref, ceph_msg_last_put); +} + +extern void ceph_msg_dump(struct ceph_msg *msg); + +#endif diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c new file mode 100644 index 00000000000..890597c09d4 --- /dev/null +++ b/fs/ceph/mon_client.c @@ -0,0 +1,834 @@ +#include "ceph_debug.h" + +#include <linux/types.h> +#include <linux/random.h> +#include <linux/sched.h> + +#include "mon_client.h" +#include "super.h" +#include "auth.h" +#include "decode.h" + +/* + * Interact with Ceph monitor cluster. Handle requests for new map + * versions, and periodically resend as needed. Also implement + * statfs() and umount(). + * + * A small cluster of Ceph "monitors" are responsible for managing critical + * cluster configuration and state information. An odd number (e.g., 3, 5) + * of cmon daemons use a modified version of the Paxos part-time parliament + * algorithm to manage the MDS map (mds cluster membership), OSD map, and + * list of clients who have mounted the file system. + * + * We maintain an open, active session with a monitor at all times in order to + * receive timely MDSMap updates. We periodically send a keepalive byte on the + * TCP socket to ensure we detect a failure. If the connection does break, we + * randomly hunt for a new monitor. Once the connection is reestablished, we + * resend any outstanding requests. + */ + +const static struct ceph_connection_operations mon_con_ops; + +static int __validate_auth(struct ceph_mon_client *monc); + +/* + * Decode a monmap blob (e.g., during mount). + */ +struct ceph_monmap *ceph_monmap_decode(void *p, void *end) +{ + struct ceph_monmap *m = NULL; + int i, err = -EINVAL; + struct ceph_fsid fsid; + u32 epoch, num_mon; + u16 version; + u32 len; + + ceph_decode_32_safe(&p, end, len, bad); + ceph_decode_need(&p, end, len, bad); + + dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p)); + + ceph_decode_16_safe(&p, end, version, bad); + + ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad); + ceph_decode_copy(&p, &fsid, sizeof(fsid)); + epoch = ceph_decode_32(&p); + + num_mon = ceph_decode_32(&p); + ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad); + + if (num_mon >= CEPH_MAX_MON) + goto bad; + m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS); + if (m == NULL) + return ERR_PTR(-ENOMEM); + m->fsid = fsid; + m->epoch = epoch; + m->num_mon = num_mon; + ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0])); + for (i = 0; i < num_mon; i++) + ceph_decode_addr(&m->mon_inst[i].addr); + + dout("monmap_decode epoch %d, num_mon %d\n", m->epoch, + m->num_mon); + for (i = 0; i < m->num_mon; i++) + dout("monmap_decode mon%d is %s\n", i, + pr_addr(&m->mon_inst[i].addr.in_addr)); + return m; + +bad: + dout("monmap_decode failed with %d\n", err); + kfree(m); + return ERR_PTR(err); +} + +/* + * return true if *addr is included in the monmap. + */ +int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr) +{ + int i; + + for (i = 0; i < m->num_mon; i++) + if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0) + return 1; + return 0; +} + +/* + * Send an auth request. + */ +static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) +{ + monc->pending_auth = 1; + monc->m_auth->front.iov_len = len; + monc->m_auth->hdr.front_len = cpu_to_le32(len); + ceph_msg_get(monc->m_auth); /* keep our ref */ + ceph_con_send(monc->con, monc->m_auth); +} + +/* + * Close monitor session, if any. + */ +static void __close_session(struct ceph_mon_client *monc) +{ + if (monc->con) { + dout("__close_session closing mon%d\n", monc->cur_mon); + ceph_con_revoke(monc->con, monc->m_auth); + ceph_con_close(monc->con); + monc->cur_mon = -1; + monc->pending_auth = 0; + ceph_auth_reset(monc->auth); + } +} + +/* + * Open a session with a (new) monitor. + */ +static int __open_session(struct ceph_mon_client *monc) +{ + char r; + int ret; + + if (monc->cur_mon < 0) { + get_random_bytes(&r, 1); + monc->cur_mon = r % monc->monmap->num_mon; + dout("open_session num=%d r=%d -> mon%d\n", + monc->monmap->num_mon, r, monc->cur_mon); + monc->sub_sent = 0; + monc->sub_renew_after = jiffies; /* i.e., expired */ + monc->want_next_osdmap = !!monc->want_next_osdmap; + + dout("open_session mon%d opening\n", monc->cur_mon); + monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON; + monc->con->peer_name.num = cpu_to_le64(monc->cur_mon); + ceph_con_open(monc->con, + &monc->monmap->mon_inst[monc->cur_mon].addr); + + /* initiatiate authentication handshake */ + ret = ceph_auth_build_hello(monc->auth, + monc->m_auth->front.iov_base, + monc->m_auth->front_max); + __send_prepared_auth_request(monc, ret); + } else { + dout("open_session mon%d already open\n", monc->cur_mon); + } + return 0; +} + +static bool __sub_expired(struct ceph_mon_client *monc) +{ + return time_after_eq(jiffies, monc->sub_renew_after); +} + +/* + * Reschedule delayed work timer. + */ +static void __schedule_delayed(struct ceph_mon_client *monc) +{ + unsigned delay; + + if (monc->cur_mon < 0 || __sub_expired(monc)) + delay = 10 * HZ; + else + delay = 20 * HZ; + dout("__schedule_delayed after %u\n", delay); + schedule_delayed_work(&monc->delayed_work, delay); +} + +/* + * Send subscribe request for mdsmap and/or osdmap. + */ +static void __send_subscribe(struct ceph_mon_client *monc) +{ + dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n", + (unsigned)monc->sub_sent, __sub_expired(monc), + monc->want_next_osdmap); + if ((__sub_expired(monc) && !monc->sub_sent) || + monc->want_next_osdmap == 1) { + struct ceph_msg *msg; + struct ceph_mon_subscribe_item *i; + void *p, *end; + + msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL); + if (!msg) + return; + + p = msg->front.iov_base; + end = p + msg->front.iov_len; + + dout("__send_subscribe to 'mdsmap' %u+\n", + (unsigned)monc->have_mdsmap); + if (monc->want_next_osdmap) { + dout("__send_subscribe to 'osdmap' %u\n", + (unsigned)monc->have_osdmap); + ceph_encode_32(&p, 3); + ceph_encode_string(&p, end, "osdmap", 6); + i = p; + i->have = cpu_to_le64(monc->have_osdmap); + i->onetime = 1; + p += sizeof(*i); + monc->want_next_osdmap = 2; /* requested */ + } else { + ceph_encode_32(&p, 2); + } + ceph_encode_string(&p, end, "mdsmap", 6); + i = p; + i->have = cpu_to_le64(monc->have_mdsmap); + i->onetime = 0; + p += sizeof(*i); + ceph_encode_string(&p, end, "monmap", 6); + i = p; + i->have = 0; + i->onetime = 0; + p += sizeof(*i); + + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + ceph_con_send(monc->con, msg); + + monc->sub_sent = jiffies | 1; /* never 0 */ + } +} + +static void handle_subscribe_ack(struct ceph_mon_client *monc, + struct ceph_msg *msg) +{ + unsigned seconds; + struct ceph_mon_subscribe_ack *h = msg->front.iov_base; + + if (msg->front.iov_len < sizeof(*h)) + goto bad; + seconds = le32_to_cpu(h->duration); + + mutex_lock(&monc->mutex); + if (monc->hunting) { + pr_info("mon%d %s session established\n", + monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr)); + monc->hunting = false; + } + dout("handle_subscribe_ack after %d seconds\n", seconds); + monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1; + monc->sub_sent = 0; + mutex_unlock(&monc->mutex); + return; +bad: + pr_err("got corrupt subscribe-ack msg\n"); + ceph_msg_dump(msg); +} + +/* + * Keep track of which maps we have + */ +int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got) +{ + mutex_lock(&monc->mutex); + monc->have_mdsmap = got; + mutex_unlock(&monc->mutex); + return 0; +} + +int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got) +{ + mutex_lock(&monc->mutex); + monc->have_osdmap = got; + monc->want_next_osdmap = 0; + mutex_unlock(&monc->mutex); + return 0; +} + +/* + * Register interest in the next osdmap + */ +void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc) +{ + dout("request_next_osdmap have %u\n", monc->have_osdmap); + mutex_lock(&monc->mutex); + if (!monc->want_next_osdmap) + monc->want_next_osdmap = 1; + if (monc->want_next_osdmap < 2) + __send_subscribe(monc); + mutex_unlock(&monc->mutex); +} + +/* + * + */ +int ceph_monc_open_session(struct ceph_mon_client *monc) +{ + if (!monc->con) { + monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL); + if (!monc->con) + return -ENOMEM; + ceph_con_init(monc->client->msgr, monc->con); + monc->con->private = monc; + monc->con->ops = &mon_con_ops; + } + + mutex_lock(&monc->mutex); + __open_session(monc); + __schedule_delayed(monc); + mutex_unlock(&monc->mutex); + return 0; +} + +/* + * The monitor responds with mount ack indicate mount success. The + * included client ticket allows the client to talk to MDSs and OSDs. + */ +static void ceph_monc_handle_map(struct ceph_mon_client *monc, + struct ceph_msg *msg) +{ + struct ceph_client *client = monc->client; + struct ceph_monmap *monmap = NULL, *old = monc->monmap; + void *p, *end; + + mutex_lock(&monc->mutex); + + dout("handle_monmap\n"); + p = msg->front.iov_base; + end = p + msg->front.iov_len; + + monmap = ceph_monmap_decode(p, end); + if (IS_ERR(monmap)) { + pr_err("problem decoding monmap, %d\n", + (int)PTR_ERR(monmap)); + goto out; + } + + if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) { + kfree(monmap); + goto out; + } + + client->monc.monmap = monmap; + kfree(old); + +out: + mutex_unlock(&monc->mutex); + wake_up(&client->auth_wq); +} + +/* + * statfs + */ +static struct ceph_mon_statfs_request *__lookup_statfs( + struct ceph_mon_client *monc, u64 tid) +{ + struct ceph_mon_statfs_request *req; + struct rb_node *n = monc->statfs_request_tree.rb_node; + + while (n) { + req = rb_entry(n, struct ceph_mon_statfs_request, node); + if (tid < req->tid) + n = n->rb_left; + else if (tid > req->tid) + n = n->rb_right; + else + return req; + } + return NULL; +} + +static void __insert_statfs(struct ceph_mon_client *monc, + struct ceph_mon_statfs_request *new) +{ + struct rb_node **p = &monc->statfs_request_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_mon_statfs_request *req = NULL; + + while (*p) { + parent = *p; + req = rb_entry(parent, struct ceph_mon_statfs_request, node); + if (new->tid < req->tid) + p = &(*p)->rb_left; + else if (new->tid > req->tid) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, &monc->statfs_request_tree); +} + +static void handle_statfs_reply(struct ceph_mon_client *monc, + struct ceph_msg *msg) +{ + struct ceph_mon_statfs_request *req; + struct ceph_mon_statfs_reply *reply = msg->front.iov_base; + u64 tid; + + if (msg->front.iov_len != sizeof(*reply)) + goto bad; + tid = le64_to_cpu(msg->hdr.tid); + dout("handle_statfs_reply %p tid %llu\n", msg, tid); + + mutex_lock(&monc->mutex); + req = __lookup_statfs(monc, tid); + if (req) { + *req->buf = reply->st; + req->result = 0; + } + mutex_unlock(&monc->mutex); + if (req) + complete(&req->completion); + return; + +bad: + pr_err("corrupt statfs reply, no tid\n"); + ceph_msg_dump(msg); +} + +/* + * (re)send a statfs request + */ +static int send_statfs(struct ceph_mon_client *monc, + struct ceph_mon_statfs_request *req) +{ + struct ceph_msg *msg; + struct ceph_mon_statfs *h; + + dout("send_statfs tid %llu\n", req->tid); + msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL); + if (IS_ERR(msg)) + return PTR_ERR(msg); + req->request = msg; + msg->hdr.tid = cpu_to_le64(req->tid); + h = msg->front.iov_base; + h->monhdr.have_version = 0; + h->monhdr.session_mon = cpu_to_le16(-1); + h->monhdr.session_mon_tid = 0; + h->fsid = monc->monmap->fsid; + ceph_con_send(monc->con, msg); + return 0; +} + +/* + * Do a synchronous statfs(). + */ +int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) +{ + struct ceph_mon_statfs_request req; + int err; + + req.buf = buf; + init_completion(&req.completion); + + /* allocate memory for reply */ + err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1); + if (err) + return err; + + /* register request */ + mutex_lock(&monc->mutex); + req.tid = ++monc->last_tid; + req.last_attempt = jiffies; + req.delay = BASE_DELAY_INTERVAL; + __insert_statfs(monc, &req); + monc->num_statfs_requests++; + mutex_unlock(&monc->mutex); + + /* send request and wait */ + err = send_statfs(monc, &req); + if (!err) + err = wait_for_completion_interruptible(&req.completion); + + mutex_lock(&monc->mutex); + rb_erase(&req.node, &monc->statfs_request_tree); + monc->num_statfs_requests--; + ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1); + mutex_unlock(&monc->mutex); + + if (!err) + err = req.result; + return err; +} + +/* + * Resend pending statfs requests. + */ +static void __resend_statfs(struct ceph_mon_client *monc) +{ + struct ceph_mon_statfs_request *req; + struct rb_node *p; + + for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) { + req = rb_entry(p, struct ceph_mon_statfs_request, node); + send_statfs(monc, req); + } +} + +/* + * Delayed work. If we haven't mounted yet, retry. Otherwise, + * renew/retry subscription as needed (in case it is timing out, or we + * got an ENOMEM). And keep the monitor connection alive. + */ +static void delayed_work(struct work_struct *work) +{ + struct ceph_mon_client *monc = + container_of(work, struct ceph_mon_client, delayed_work.work); + + dout("monc delayed_work\n"); + mutex_lock(&monc->mutex); + if (monc->hunting) { + __close_session(monc); + __open_session(monc); /* continue hunting */ + } else { + ceph_con_keepalive(monc->con); + + __validate_auth(monc); + + if (monc->auth->ops->is_authenticated(monc->auth)) + __send_subscribe(monc); + } + __schedule_delayed(monc); + mutex_unlock(&monc->mutex); +} + +/* + * On startup, we build a temporary monmap populated with the IPs + * provided by mount(2). + */ +static int build_initial_monmap(struct ceph_mon_client *monc) +{ + struct ceph_mount_args *args = monc->client->mount_args; + struct ceph_entity_addr *mon_addr = args->mon_addr; + int num_mon = args->num_mon; + int i; + + /* build initial monmap */ + monc->monmap = kzalloc(sizeof(*monc->monmap) + + num_mon*sizeof(monc->monmap->mon_inst[0]), + GFP_KERNEL); + if (!monc->monmap) + return -ENOMEM; + for (i = 0; i < num_mon; i++) { + monc->monmap->mon_inst[i].addr = mon_addr[i]; + monc->monmap->mon_inst[i].addr.nonce = 0; + monc->monmap->mon_inst[i].name.type = + CEPH_ENTITY_TYPE_MON; + monc->monmap->mon_inst[i].name.num = cpu_to_le64(i); + } + monc->monmap->num_mon = num_mon; + monc->have_fsid = false; + + /* release addr memory */ + kfree(args->mon_addr); + args->mon_addr = NULL; + args->num_mon = 0; + return 0; +} + +int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) +{ + int err = 0; + + dout("init\n"); + memset(monc, 0, sizeof(*monc)); + monc->client = cl; + monc->monmap = NULL; + mutex_init(&monc->mutex); + + err = build_initial_monmap(monc); + if (err) + goto out; + + monc->con = NULL; + + /* authentication */ + monc->auth = ceph_auth_init(cl->mount_args->name, + cl->mount_args->secret); + if (IS_ERR(monc->auth)) + return PTR_ERR(monc->auth); + monc->auth->want_keys = + CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | + CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; + + /* msg pools */ + err = ceph_msgpool_init(&monc->msgpool_subscribe_ack, + sizeof(struct ceph_mon_subscribe_ack), 1, false); + if (err < 0) + goto out_monmap; + err = ceph_msgpool_init(&monc->msgpool_statfs_reply, + sizeof(struct ceph_mon_statfs_reply), 0, false); + if (err < 0) + goto out_pool1; + err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false); + if (err < 0) + goto out_pool2; + + monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL); + monc->pending_auth = 0; + if (IS_ERR(monc->m_auth)) { + err = PTR_ERR(monc->m_auth); + monc->m_auth = NULL; + goto out_pool3; + } + + monc->cur_mon = -1; + monc->hunting = true; + monc->sub_renew_after = jiffies; + monc->sub_sent = 0; + + INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); + monc->statfs_request_tree = RB_ROOT; + monc->num_statfs_requests = 0; + monc->last_tid = 0; + + monc->have_mdsmap = 0; + monc->have_osdmap = 0; + monc->want_next_osdmap = 1; + return 0; + +out_pool3: + ceph_msgpool_destroy(&monc->msgpool_auth_reply); +out_pool2: + ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); +out_pool1: + ceph_msgpool_destroy(&monc->msgpool_statfs_reply); +out_monmap: + kfree(monc->monmap); +out: + return err; +} + +void ceph_monc_stop(struct ceph_mon_client *monc) +{ + dout("stop\n"); + cancel_delayed_work_sync(&monc->delayed_work); + + mutex_lock(&monc->mutex); + __close_session(monc); + if (monc->con) { + monc->con->private = NULL; + monc->con->ops->put(monc->con); + monc->con = NULL; + } + mutex_unlock(&monc->mutex); + + ceph_auth_destroy(monc->auth); + + ceph_msg_put(monc->m_auth); + ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); + ceph_msgpool_destroy(&monc->msgpool_statfs_reply); + ceph_msgpool_destroy(&monc->msgpool_auth_reply); + + kfree(monc->monmap); +} + +static void handle_auth_reply(struct ceph_mon_client *monc, + struct ceph_msg *msg) +{ + int ret; + + mutex_lock(&monc->mutex); + monc->pending_auth = 0; + ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, + msg->front.iov_len, + monc->m_auth->front.iov_base, + monc->m_auth->front_max); + if (ret < 0) { + monc->client->auth_err = ret; + wake_up(&monc->client->auth_wq); + } else if (ret > 0) { + __send_prepared_auth_request(monc, ret); + } else if (monc->auth->ops->is_authenticated(monc->auth)) { + dout("authenticated, starting session\n"); + + monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; + monc->client->msgr->inst.name.num = monc->auth->global_id; + + __send_subscribe(monc); + __resend_statfs(monc); + } + mutex_unlock(&monc->mutex); +} + +static int __validate_auth(struct ceph_mon_client *monc) +{ + int ret; + + if (monc->pending_auth) + return 0; + + ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base, + monc->m_auth->front_max); + if (ret <= 0) + return ret; /* either an error, or no need to authenticate */ + __send_prepared_auth_request(monc, ret); + return 0; +} + +int ceph_monc_validate_auth(struct ceph_mon_client *monc) +{ + int ret; + + mutex_lock(&monc->mutex); + ret = __validate_auth(monc); + mutex_unlock(&monc->mutex); + return ret; +} + +/* + * handle incoming message + */ +static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) +{ + struct ceph_mon_client *monc = con->private; + int type = le16_to_cpu(msg->hdr.type); + + if (!monc) + return; + + switch (type) { + case CEPH_MSG_AUTH_REPLY: + handle_auth_reply(monc, msg); + break; + + case CEPH_MSG_MON_SUBSCRIBE_ACK: + handle_subscribe_ack(monc, msg); + break; + + case CEPH_MSG_STATFS_REPLY: + handle_statfs_reply(monc, msg); + break; + + case CEPH_MSG_MON_MAP: + ceph_monc_handle_map(monc, msg); + break; + + case CEPH_MSG_MDS_MAP: + ceph_mdsc_handle_map(&monc->client->mdsc, msg); + break; + + case CEPH_MSG_OSD_MAP: + ceph_osdc_handle_map(&monc->client->osdc, msg); + break; + + default: + pr_err("received unknown message type %d %s\n", type, + ceph_msg_type_name(type)); + } + ceph_msg_put(msg); +} + +/* + * Allocate memory for incoming message + */ +static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip) +{ + struct ceph_mon_client *monc = con->private; + int type = le16_to_cpu(hdr->type); + int front_len = le32_to_cpu(hdr->front_len); + struct ceph_msg *m = NULL; + + *skip = 0; + + switch (type) { + case CEPH_MSG_MON_SUBSCRIBE_ACK: + m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len); + break; + case CEPH_MSG_STATFS_REPLY: + m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len); + break; + case CEPH_MSG_AUTH_REPLY: + m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len); + break; + case CEPH_MSG_MON_MAP: + case CEPH_MSG_MDS_MAP: + case CEPH_MSG_OSD_MAP: + m = ceph_msg_new(type, front_len, 0, 0, NULL); + break; + } + + if (!m) { + pr_info("alloc_msg unknown type %d\n", type); + *skip = 1; + } + return m; +} + +/* + * If the monitor connection resets, pick a new monitor and resubmit + * any pending requests. + */ +static void mon_fault(struct ceph_connection *con) +{ + struct ceph_mon_client *monc = con->private; + + if (!monc) + return; + + dout("mon_fault\n"); + mutex_lock(&monc->mutex); + if (!con->private) + goto out; + + if (monc->con && !monc->hunting) + pr_info("mon%d %s session lost, " + "hunting for new mon\n", monc->cur_mon, + pr_addr(&monc->con->peer_addr.in_addr)); + + __close_session(monc); + if (!monc->hunting) { + /* start hunting */ + monc->hunting = true; + __open_session(monc); + } else { + /* already hunting, let's wait a bit */ + __schedule_delayed(monc); + } +out: + mutex_unlock(&monc->mutex); +} + +const static struct ceph_connection_operations mon_con_ops = { + .get = ceph_con_get, + .put = ceph_con_put, + .dispatch = dispatch, + .fault = mon_fault, + .alloc_msg = mon_alloc_msg, +}; diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h new file mode 100644 index 00000000000..b958ad5afa0 --- /dev/null +++ b/fs/ceph/mon_client.h @@ -0,0 +1,119 @@ +#ifndef _FS_CEPH_MON_CLIENT_H +#define _FS_CEPH_MON_CLIENT_H + +#include <linux/completion.h> +#include <linux/rbtree.h> + +#include "messenger.h" +#include "msgpool.h" + +struct ceph_client; +struct ceph_mount_args; +struct ceph_auth_client; + +/* + * The monitor map enumerates the set of all monitors. + */ +struct ceph_monmap { + struct ceph_fsid fsid; + u32 epoch; + u32 num_mon; + struct ceph_entity_inst mon_inst[0]; +}; + +struct ceph_mon_client; +struct ceph_mon_statfs_request; + + +/* + * Generic mechanism for resending monitor requests. + */ +typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc, + int newmon); + +/* a pending monitor request */ +struct ceph_mon_request { + struct ceph_mon_client *monc; + struct delayed_work delayed_work; + unsigned long delay; + ceph_monc_request_func_t do_request; +}; + +/* + * statfs() is done a bit differently because we need to get data back + * to the caller + */ +struct ceph_mon_statfs_request { + u64 tid; + struct rb_node node; + int result; + struct ceph_statfs *buf; + struct completion completion; + unsigned long last_attempt, delay; /* jiffies */ + struct ceph_msg *request; /* original request */ +}; + +struct ceph_mon_client { + struct ceph_client *client; + struct ceph_monmap *monmap; + + struct mutex mutex; + struct delayed_work delayed_work; + + struct ceph_auth_client *auth; + struct ceph_msg *m_auth; + int pending_auth; + + bool hunting; + int cur_mon; /* last monitor i contacted */ + unsigned long sub_sent, sub_renew_after; + struct ceph_connection *con; + bool have_fsid; + + /* msg pools */ + struct ceph_msgpool msgpool_subscribe_ack; + struct ceph_msgpool msgpool_statfs_reply; + struct ceph_msgpool msgpool_auth_reply; + + /* pending statfs requests */ + struct rb_root statfs_request_tree; + int num_statfs_requests; + u64 last_tid; + + /* mds/osd map */ + int want_next_osdmap; /* 1 = want, 2 = want+asked */ + u32 have_osdmap, have_mdsmap; + +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_file; +#endif +}; + +extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end); +extern int ceph_monmap_contains(struct ceph_monmap *m, + struct ceph_entity_addr *addr); + +extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); +extern void ceph_monc_stop(struct ceph_mon_client *monc); + +/* + * The model here is to indicate that we need a new map of at least + * epoch @want, and also call in when we receive a map. We will + * periodically rerequest the map from the monitor cluster until we + * get what we want. + */ +extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have); +extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have); + +extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc); + +extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, + struct ceph_statfs *buf); + +extern int ceph_monc_open_session(struct ceph_mon_client *monc); + +extern int ceph_monc_validate_auth(struct ceph_mon_client *monc); + + + +#endif diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c new file mode 100644 index 00000000000..ca3b44a89f2 --- /dev/null +++ b/fs/ceph/msgpool.c @@ -0,0 +1,186 @@ +#include "ceph_debug.h" + +#include <linux/err.h> +#include <linux/sched.h> +#include <linux/types.h> +#include <linux/vmalloc.h> + +#include "msgpool.h" + +/* + * We use msg pools to preallocate memory for messages we expect to + * receive over the wire, to avoid getting ourselves into OOM + * conditions at unexpected times. We take use a few different + * strategies: + * + * - for request/response type interactions, we preallocate the + * memory needed for the response when we generate the request. + * + * - for messages we can receive at any time from the MDS, we preallocate + * a pool of messages we can re-use. + * + * - for writeback, we preallocate some number of messages to use for + * requests and their replies, so that we always make forward + * progress. + * + * The msgpool behaves like a mempool_t, but keeps preallocated + * ceph_msgs strung together on a list_head instead of using a pointer + * vector. This avoids vector reallocation when we adjust the number + * of preallocated items (which happens frequently). + */ + + +/* + * Allocate or release as necessary to meet our target pool size. + */ +static int __fill_msgpool(struct ceph_msgpool *pool) +{ + struct ceph_msg *msg; + + while (pool->num < pool->min) { + dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num, + pool->min); + spin_unlock(&pool->lock); + msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL); + spin_lock(&pool->lock); + if (IS_ERR(msg)) + return PTR_ERR(msg); + msg->pool = pool; + list_add(&msg->list_head, &pool->msgs); + pool->num++; + } + while (pool->num > pool->min) { + msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head); + dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num, + pool->min, msg); + list_del_init(&msg->list_head); + pool->num--; + ceph_msg_kfree(msg); + } + return 0; +} + +int ceph_msgpool_init(struct ceph_msgpool *pool, + int front_len, int min, bool blocking) +{ + int ret; + + dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min); + spin_lock_init(&pool->lock); + pool->front_len = front_len; + INIT_LIST_HEAD(&pool->msgs); + pool->num = 0; + pool->min = min; + pool->blocking = blocking; + init_waitqueue_head(&pool->wait); + + spin_lock(&pool->lock); + ret = __fill_msgpool(pool); + spin_unlock(&pool->lock); + return ret; +} + +void ceph_msgpool_destroy(struct ceph_msgpool *pool) +{ + dout("msgpool_destroy %p\n", pool); + spin_lock(&pool->lock); + pool->min = 0; + __fill_msgpool(pool); + spin_unlock(&pool->lock); +} + +int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta) +{ + int ret; + + spin_lock(&pool->lock); + dout("msgpool_resv %p delta %d\n", pool, delta); + pool->min += delta; + ret = __fill_msgpool(pool); + spin_unlock(&pool->lock); + return ret; +} + +struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len) +{ + wait_queue_t wait; + struct ceph_msg *msg; + + if (front_len && front_len > pool->front_len) { + pr_err("msgpool_get pool %p need front %d, pool size is %d\n", + pool, front_len, pool->front_len); + WARN_ON(1); + + /* try to alloc a fresh message */ + msg = ceph_msg_new(0, front_len, 0, 0, NULL); + if (!IS_ERR(msg)) + return msg; + } + + if (!front_len) + front_len = pool->front_len; + + if (pool->blocking) { + /* mempool_t behavior; first try to alloc */ + msg = ceph_msg_new(0, front_len, 0, 0, NULL); + if (!IS_ERR(msg)) + return msg; + } + + while (1) { + spin_lock(&pool->lock); + if (likely(pool->num)) { + msg = list_entry(pool->msgs.next, struct ceph_msg, + list_head); + list_del_init(&msg->list_head); + pool->num--; + dout("msgpool_get %p got %p, now %d/%d\n", pool, msg, + pool->num, pool->min); + spin_unlock(&pool->lock); + return msg; + } + pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num, + pool->min, pool->blocking ? "waiting" : "may fail"); + spin_unlock(&pool->lock); + + if (!pool->blocking) { + WARN_ON(1); + + /* maybe we can allocate it now? */ + msg = ceph_msg_new(0, front_len, 0, 0, NULL); + if (!IS_ERR(msg)) + return msg; + + pr_err("msgpool_get %p empty + alloc failed\n", pool); + return ERR_PTR(-ENOMEM); + } + + init_wait(&wait); + prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); + schedule(); + finish_wait(&pool->wait, &wait); + } +} + +void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg) +{ + spin_lock(&pool->lock); + if (pool->num < pool->min) { + /* reset msg front_len; user may have changed it */ + msg->front.iov_len = pool->front_len; + msg->hdr.front_len = cpu_to_le32(pool->front_len); + + kref_set(&msg->kref, 1); /* retake a single ref */ + list_add(&msg->list_head, &pool->msgs); + pool->num++; + dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg, + pool->num, pool->min); + spin_unlock(&pool->lock); + wake_up(&pool->wait); + } else { + dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg, + pool->num, pool->min); + spin_unlock(&pool->lock); + ceph_msg_kfree(msg); + } +} diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h new file mode 100644 index 00000000000..bc834bfcd72 --- /dev/null +++ b/fs/ceph/msgpool.h @@ -0,0 +1,27 @@ +#ifndef _FS_CEPH_MSGPOOL +#define _FS_CEPH_MSGPOOL + +#include "messenger.h" + +/* + * we use memory pools for preallocating messages we may receive, to + * avoid unexpected OOM conditions. + */ +struct ceph_msgpool { + spinlock_t lock; + int front_len; /* preallocated payload size */ + struct list_head msgs; /* msgs in the pool; each has 1 ref */ + int num, min; /* cur, min # msgs in the pool */ + bool blocking; + wait_queue_head_t wait; +}; + +extern int ceph_msgpool_init(struct ceph_msgpool *pool, + int front_len, int size, bool blocking); +extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); +extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta); +extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *, + int front_len); +extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *); + +#endif diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h new file mode 100644 index 00000000000..8aaab414f3f --- /dev/null +++ b/fs/ceph/msgr.h @@ -0,0 +1,158 @@ +#ifndef __MSGR_H +#define __MSGR_H + +/* + * Data types for message passing layer used by Ceph. + */ + +#define CEPH_MON_PORT 6789 /* default monitor port */ + +/* + * client-side processes will try to bind to ports in this + * range, simply for the benefit of tools like nmap or wireshark + * that would like to identify the protocol. + */ +#define CEPH_PORT_FIRST 6789 +#define CEPH_PORT_START 6800 /* non-monitors start here */ +#define CEPH_PORT_LAST 6900 + +/* + * tcp connection banner. include a protocol version. and adjust + * whenever the wire protocol changes. try to keep this string length + * constant. + */ +#define CEPH_BANNER "ceph v027" +#define CEPH_BANNER_MAX_LEN 30 + + +/* + * Rollover-safe type and comparator for 32-bit sequence numbers. + * Comparator returns -1, 0, or 1. + */ +typedef __u32 ceph_seq_t; + +static inline __s32 ceph_seq_cmp(__u32 a, __u32 b) +{ + return (__s32)a - (__s32)b; +} + + +/* + * entity_name -- logical name for a process participating in the + * network, e.g. 'mds0' or 'osd3'. + */ +struct ceph_entity_name { + __u8 type; /* CEPH_ENTITY_TYPE_* */ + __le64 num; +} __attribute__ ((packed)); + +#define CEPH_ENTITY_TYPE_MON 0x01 +#define CEPH_ENTITY_TYPE_MDS 0x02 +#define CEPH_ENTITY_TYPE_OSD 0x04 +#define CEPH_ENTITY_TYPE_CLIENT 0x08 +#define CEPH_ENTITY_TYPE_ADMIN 0x10 +#define CEPH_ENTITY_TYPE_AUTH 0x20 + +#define CEPH_ENTITY_TYPE_ANY 0xFF + +extern const char *ceph_entity_type_name(int type); + +/* + * entity_addr -- network address + */ +struct ceph_entity_addr { + __le32 type; + __le32 nonce; /* unique id for process (e.g. pid) */ + struct sockaddr_storage in_addr; +} __attribute__ ((packed)); + +struct ceph_entity_inst { + struct ceph_entity_name name; + struct ceph_entity_addr addr; +} __attribute__ ((packed)); + + +/* used by message exchange protocol */ +#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */ +#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */ +#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing + incoming connection */ +#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again + with higher cseq */ +#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again + with higher gseq */ +#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */ +#define CEPH_MSGR_TAG_MSG 7 /* message */ +#define CEPH_MSGR_TAG_ACK 8 /* message ack */ +#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */ +#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */ +#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */ +#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */ + + +/* + * connection negotiation + */ +struct ceph_msg_connect { + __le64 features; /* supported feature bits */ + __le32 host_type; /* CEPH_ENTITY_TYPE_* */ + __le32 global_seq; /* count connections initiated by this host */ + __le32 connect_seq; /* count connections initiated in this session */ + __le32 protocol_version; + __le32 authorizer_protocol; + __le32 authorizer_len; + __u8 flags; /* CEPH_MSG_CONNECT_* */ +} __attribute__ ((packed)); + +struct ceph_msg_connect_reply { + __u8 tag; + __le64 features; /* feature bits for this session */ + __le32 global_seq; + __le32 connect_seq; + __le32 protocol_version; + __le32 authorizer_len; + __u8 flags; +} __attribute__ ((packed)); + +#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */ + + +/* + * message header + */ +struct ceph_msg_header { + __le64 seq; /* message seq# for this session */ + __le64 tid; /* transaction id */ + __le16 type; /* message type */ + __le16 priority; /* priority. higher value == higher priority */ + __le16 version; /* version of message encoding */ + + __le32 front_len; /* bytes in main payload */ + __le32 middle_len;/* bytes in middle payload */ + __le32 data_len; /* bytes of data payload */ + __le16 data_off; /* sender: include full offset; + receiver: mask against ~PAGE_MASK */ + + struct ceph_entity_inst src, orig_src; + __le32 reserved; + __le32 crc; /* header crc32c */ +} __attribute__ ((packed)); + +#define CEPH_MSG_PRIO_LOW 64 +#define CEPH_MSG_PRIO_DEFAULT 127 +#define CEPH_MSG_PRIO_HIGH 196 +#define CEPH_MSG_PRIO_HIGHEST 255 + +/* + * follows data payload + */ +struct ceph_msg_footer { + __le32 front_crc, middle_crc, data_crc; + __u8 flags; +} __attribute__ ((packed)); + +#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */ +#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */ + + +#endif diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c new file mode 100644 index 00000000000..dbe63db9762 --- /dev/null +++ b/fs/ceph/osd_client.c @@ -0,0 +1,1537 @@ +#include "ceph_debug.h" + +#include <linux/err.h> +#include <linux/highmem.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/uaccess.h> + +#include "super.h" +#include "osd_client.h" +#include "messenger.h" +#include "decode.h" +#include "auth.h" + +#define OSD_OP_FRONT_LEN 4096 +#define OSD_OPREPLY_FRONT_LEN 512 + +const static struct ceph_connection_operations osd_con_ops; +static int __kick_requests(struct ceph_osd_client *osdc, + struct ceph_osd *kickosd); + +static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd); + +/* + * Implement client access to distributed object storage cluster. + * + * All data objects are stored within a cluster/cloud of OSDs, or + * "object storage devices." (Note that Ceph OSDs have _nothing_ to + * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply + * remote daemons serving up and coordinating consistent and safe + * access to storage. + * + * Cluster membership and the mapping of data objects onto storage devices + * are described by the osd map. + * + * We keep track of pending OSD requests (read, write), resubmit + * requests to different OSDs when the cluster topology/data layout + * change, or retry the affected requests when the communications + * channel with an OSD is reset. + */ + +/* + * calculate the mapping of a file extent onto an object, and fill out the + * request accordingly. shorten extent as necessary if it crosses an + * object boundary. + * + * fill osd op in request message. + */ +static void calc_layout(struct ceph_osd_client *osdc, + struct ceph_vino vino, struct ceph_file_layout *layout, + u64 off, u64 *plen, + struct ceph_osd_request *req) +{ + struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; + struct ceph_osd_op *op = (void *)(reqhead + 1); + u64 orig_len = *plen; + u64 objoff, objlen; /* extent in object */ + u64 bno; + + reqhead->snapid = cpu_to_le64(vino.snap); + + /* object extent? */ + ceph_calc_file_object_mapping(layout, off, plen, &bno, + &objoff, &objlen); + if (*plen < orig_len) + dout(" skipping last %llu, final file extent %llu~%llu\n", + orig_len - *plen, off, *plen); + + sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno); + req->r_oid_len = strlen(req->r_oid); + + op->extent.offset = cpu_to_le64(objoff); + op->extent.length = cpu_to_le64(objlen); + req->r_num_pages = calc_pages_for(off, *plen); + + dout("calc_layout %s (%d) %llu~%llu (%d pages)\n", + req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages); +} + +/* + * requests + */ +void ceph_osdc_release_request(struct kref *kref) +{ + struct ceph_osd_request *req = container_of(kref, + struct ceph_osd_request, + r_kref); + + if (req->r_request) + ceph_msg_put(req->r_request); + if (req->r_reply) + ceph_msg_put(req->r_reply); + if (req->r_con_filling_msg) { + dout("release_request revoking pages %p from con %p\n", + req->r_pages, req->r_con_filling_msg); + ceph_con_revoke_message(req->r_con_filling_msg, + req->r_reply); + ceph_con_put(req->r_con_filling_msg); + } + if (req->r_own_pages) + ceph_release_page_vector(req->r_pages, + req->r_num_pages); + ceph_put_snap_context(req->r_snapc); + if (req->r_mempool) + mempool_free(req, req->r_osdc->req_mempool); + else + kfree(req); +} + +/* + * build new request AND message, calculate layout, and adjust file + * extent as needed. + * + * if the file was recently truncated, we include information about its + * old and new size so that the object can be updated appropriately. (we + * avoid synchronously deleting truncated objects because it's slow.) + * + * if @do_sync, include a 'startsync' command so that the osd will flush + * data quickly. + */ +struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, + struct ceph_file_layout *layout, + struct ceph_vino vino, + u64 off, u64 *plen, + int opcode, int flags, + struct ceph_snap_context *snapc, + int do_sync, + u32 truncate_seq, + u64 truncate_size, + struct timespec *mtime, + bool use_mempool, int num_reply) +{ + struct ceph_osd_request *req; + struct ceph_msg *msg; + struct ceph_osd_request_head *head; + struct ceph_osd_op *op; + void *p; + int num_op = 1 + do_sync; + size_t msg_size = sizeof(*head) + num_op*sizeof(*op); + int i; + + if (use_mempool) { + req = mempool_alloc(osdc->req_mempool, GFP_NOFS); + memset(req, 0, sizeof(*req)); + } else { + req = kzalloc(sizeof(*req), GFP_NOFS); + } + if (req == NULL) + return ERR_PTR(-ENOMEM); + + req->r_osdc = osdc; + req->r_mempool = use_mempool; + kref_init(&req->r_kref); + init_completion(&req->r_completion); + init_completion(&req->r_safe_completion); + INIT_LIST_HEAD(&req->r_unsafe_item); + req->r_flags = flags; + + WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); + + /* create reply message */ + if (use_mempool) + msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); + else + msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, + OSD_OPREPLY_FRONT_LEN, 0, 0, NULL); + if (IS_ERR(msg)) { + ceph_osdc_put_request(req); + return ERR_PTR(PTR_ERR(msg)); + } + req->r_reply = msg; + + /* create request message; allow space for oid */ + msg_size += 40; + if (snapc) + msg_size += sizeof(u64) * snapc->num_snaps; + if (use_mempool) + msg = ceph_msgpool_get(&osdc->msgpool_op, 0); + else + msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL); + if (IS_ERR(msg)) { + ceph_osdc_put_request(req); + return ERR_PTR(PTR_ERR(msg)); + } + msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); + memset(msg->front.iov_base, 0, msg->front.iov_len); + head = msg->front.iov_base; + op = (void *)(head + 1); + p = (void *)(op + num_op); + + req->r_request = msg; + req->r_snapc = ceph_get_snap_context(snapc); + + head->client_inc = cpu_to_le32(1); /* always, for now. */ + head->flags = cpu_to_le32(flags); + if (flags & CEPH_OSD_FLAG_WRITE) + ceph_encode_timespec(&head->mtime, mtime); + head->num_ops = cpu_to_le16(num_op); + op->op = cpu_to_le16(opcode); + + /* calculate max write size */ + calc_layout(osdc, vino, layout, off, plen, req); + req->r_file_layout = *layout; /* keep a copy */ + + if (flags & CEPH_OSD_FLAG_WRITE) { + req->r_request->hdr.data_off = cpu_to_le16(off); + req->r_request->hdr.data_len = cpu_to_le32(*plen); + op->payload_len = cpu_to_le32(*plen); + } + op->extent.truncate_size = cpu_to_le64(truncate_size); + op->extent.truncate_seq = cpu_to_le32(truncate_seq); + + /* fill in oid */ + head->object_len = cpu_to_le32(req->r_oid_len); + memcpy(p, req->r_oid, req->r_oid_len); + p += req->r_oid_len; + + if (do_sync) { + op++; + op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC); + } + if (snapc) { + head->snap_seq = cpu_to_le64(snapc->seq); + head->num_snaps = cpu_to_le32(snapc->num_snaps); + for (i = 0; i < snapc->num_snaps; i++) { + put_unaligned_le64(snapc->snaps[i], p); + p += sizeof(u64); + } + } + + BUG_ON(p > msg->front.iov_base + msg->front.iov_len); + msg_size = p - msg->front.iov_base; + msg->front.iov_len = msg_size; + msg->hdr.front_len = cpu_to_le32(msg_size); + return req; +} + +/* + * We keep osd requests in an rbtree, sorted by ->r_tid. + */ +static void __insert_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *new) +{ + struct rb_node **p = &osdc->requests.rb_node; + struct rb_node *parent = NULL; + struct ceph_osd_request *req = NULL; + + while (*p) { + parent = *p; + req = rb_entry(parent, struct ceph_osd_request, r_node); + if (new->r_tid < req->r_tid) + p = &(*p)->rb_left; + else if (new->r_tid > req->r_tid) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->r_node, parent, p); + rb_insert_color(&new->r_node, &osdc->requests); +} + +static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc, + u64 tid) +{ + struct ceph_osd_request *req; + struct rb_node *n = osdc->requests.rb_node; + + while (n) { + req = rb_entry(n, struct ceph_osd_request, r_node); + if (tid < req->r_tid) + n = n->rb_left; + else if (tid > req->r_tid) + n = n->rb_right; + else + return req; + } + return NULL; +} + +static struct ceph_osd_request * +__lookup_request_ge(struct ceph_osd_client *osdc, + u64 tid) +{ + struct ceph_osd_request *req; + struct rb_node *n = osdc->requests.rb_node; + + while (n) { + req = rb_entry(n, struct ceph_osd_request, r_node); + if (tid < req->r_tid) { + if (!n->rb_left) + return req; + n = n->rb_left; + } else if (tid > req->r_tid) { + n = n->rb_right; + } else { + return req; + } + } + return NULL; +} + + +/* + * If the osd connection drops, we need to resubmit all requests. + */ +static void osd_reset(struct ceph_connection *con) +{ + struct ceph_osd *osd = con->private; + struct ceph_osd_client *osdc; + + if (!osd) + return; + dout("osd_reset osd%d\n", osd->o_osd); + osdc = osd->o_osdc; + down_read(&osdc->map_sem); + kick_requests(osdc, osd); + up_read(&osdc->map_sem); +} + +/* + * Track open sessions with osds. + */ +static struct ceph_osd *create_osd(struct ceph_osd_client *osdc) +{ + struct ceph_osd *osd; + + osd = kzalloc(sizeof(*osd), GFP_NOFS); + if (!osd) + return NULL; + + atomic_set(&osd->o_ref, 1); + osd->o_osdc = osdc; + INIT_LIST_HEAD(&osd->o_requests); + INIT_LIST_HEAD(&osd->o_osd_lru); + osd->o_incarnation = 1; + + ceph_con_init(osdc->client->msgr, &osd->o_con); + osd->o_con.private = osd; + osd->o_con.ops = &osd_con_ops; + osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD; + + INIT_LIST_HEAD(&osd->o_keepalive_item); + return osd; +} + +static struct ceph_osd *get_osd(struct ceph_osd *osd) +{ + if (atomic_inc_not_zero(&osd->o_ref)) { + dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1, + atomic_read(&osd->o_ref)); + return osd; + } else { + dout("get_osd %p FAIL\n", osd); + return NULL; + } +} + +static void put_osd(struct ceph_osd *osd) +{ + dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), + atomic_read(&osd->o_ref) - 1); + if (atomic_dec_and_test(&osd->o_ref)) + kfree(osd); +} + +/* + * remove an osd from our map + */ +static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) +{ + dout("__remove_osd %p\n", osd); + BUG_ON(!list_empty(&osd->o_requests)); + rb_erase(&osd->o_node, &osdc->osds); + list_del_init(&osd->o_osd_lru); + ceph_con_close(&osd->o_con); + put_osd(osd); +} + +static void __move_osd_to_lru(struct ceph_osd_client *osdc, + struct ceph_osd *osd) +{ + dout("__move_osd_to_lru %p\n", osd); + BUG_ON(!list_empty(&osd->o_osd_lru)); + list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); + osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ; +} + +static void __remove_osd_from_lru(struct ceph_osd *osd) +{ + dout("__remove_osd_from_lru %p\n", osd); + if (!list_empty(&osd->o_osd_lru)) + list_del_init(&osd->o_osd_lru); +} + +static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all) +{ + struct ceph_osd *osd, *nosd; + + dout("__remove_old_osds %p\n", osdc); + mutex_lock(&osdc->request_mutex); + list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { + if (!remove_all && time_before(jiffies, osd->lru_ttl)) + break; + __remove_osd(osdc, osd); + } + mutex_unlock(&osdc->request_mutex); +} + +/* + * reset osd connect + */ +static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) +{ + int ret = 0; + + dout("__reset_osd %p osd%d\n", osd, osd->o_osd); + if (list_empty(&osd->o_requests)) { + __remove_osd(osdc, osd); + } else { + ceph_con_close(&osd->o_con); + ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]); + osd->o_incarnation++; + } + return ret; +} + +static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new) +{ + struct rb_node **p = &osdc->osds.rb_node; + struct rb_node *parent = NULL; + struct ceph_osd *osd = NULL; + + while (*p) { + parent = *p; + osd = rb_entry(parent, struct ceph_osd, o_node); + if (new->o_osd < osd->o_osd) + p = &(*p)->rb_left; + else if (new->o_osd > osd->o_osd) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->o_node, parent, p); + rb_insert_color(&new->o_node, &osdc->osds); +} + +static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o) +{ + struct ceph_osd *osd; + struct rb_node *n = osdc->osds.rb_node; + + while (n) { + osd = rb_entry(n, struct ceph_osd, o_node); + if (o < osd->o_osd) + n = n->rb_left; + else if (o > osd->o_osd) + n = n->rb_right; + else + return osd; + } + return NULL; +} + +static void __schedule_osd_timeout(struct ceph_osd_client *osdc) +{ + schedule_delayed_work(&osdc->timeout_work, + osdc->client->mount_args->osd_keepalive_timeout * HZ); +} + +static void __cancel_osd_timeout(struct ceph_osd_client *osdc) +{ + cancel_delayed_work(&osdc->timeout_work); +} + +/* + * Register request, assign tid. If this is the first request, set up + * the timeout event. + */ +static void register_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + mutex_lock(&osdc->request_mutex); + req->r_tid = ++osdc->last_tid; + req->r_request->hdr.tid = cpu_to_le64(req->r_tid); + INIT_LIST_HEAD(&req->r_req_lru_item); + + dout("register_request %p tid %lld\n", req, req->r_tid); + __insert_request(osdc, req); + ceph_osdc_get_request(req); + osdc->num_requests++; + + if (osdc->num_requests == 1) { + dout(" first request, scheduling timeout\n"); + __schedule_osd_timeout(osdc); + } + mutex_unlock(&osdc->request_mutex); +} + +/* + * called under osdc->request_mutex + */ +static void __unregister_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + dout("__unregister_request %p tid %lld\n", req, req->r_tid); + rb_erase(&req->r_node, &osdc->requests); + osdc->num_requests--; + + if (req->r_osd) { + /* make sure the original request isn't in flight. */ + ceph_con_revoke(&req->r_osd->o_con, req->r_request); + + list_del_init(&req->r_osd_item); + if (list_empty(&req->r_osd->o_requests)) + __move_osd_to_lru(osdc, req->r_osd); + req->r_osd = NULL; + } + + ceph_osdc_put_request(req); + + list_del_init(&req->r_req_lru_item); + if (osdc->num_requests == 0) { + dout(" no requests, canceling timeout\n"); + __cancel_osd_timeout(osdc); + } +} + +/* + * Cancel a previously queued request message + */ +static void __cancel_request(struct ceph_osd_request *req) +{ + if (req->r_sent) { + ceph_con_revoke(&req->r_osd->o_con, req->r_request); + req->r_sent = 0; + } + list_del_init(&req->r_req_lru_item); +} + +/* + * Pick an osd (the first 'up' osd in the pg), allocate the osd struct + * (as needed), and set the request r_osd appropriately. If there is + * no up osd, set r_osd to NULL. + * + * Return 0 if unchanged, 1 if changed, or negative on error. + * + * Caller should hold map_sem for read and request_mutex. + */ +static int __map_osds(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; + struct ceph_pg pgid; + int o = -1; + int err; + + dout("map_osds %p tid %lld\n", req, req->r_tid); + err = ceph_calc_object_layout(&reqhead->layout, req->r_oid, + &req->r_file_layout, osdc->osdmap); + if (err) + return err; + pgid = reqhead->layout.ol_pgid; + req->r_pgid = pgid; + + o = ceph_calc_pg_primary(osdc->osdmap, pgid); + + if ((req->r_osd && req->r_osd->o_osd == o && + req->r_sent >= req->r_osd->o_incarnation) || + (req->r_osd == NULL && o == -1)) + return 0; /* no change */ + + dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n", + req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o, + req->r_osd ? req->r_osd->o_osd : -1); + + if (req->r_osd) { + __cancel_request(req); + list_del_init(&req->r_osd_item); + req->r_osd = NULL; + } + + req->r_osd = __lookup_osd(osdc, o); + if (!req->r_osd && o >= 0) { + err = -ENOMEM; + req->r_osd = create_osd(osdc); + if (!req->r_osd) + goto out; + + dout("map_osds osd %p is osd%d\n", req->r_osd, o); + req->r_osd->o_osd = o; + req->r_osd->o_con.peer_name.num = cpu_to_le64(o); + __insert_osd(osdc, req->r_osd); + + ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]); + } + + if (req->r_osd) { + __remove_osd_from_lru(req->r_osd); + list_add(&req->r_osd_item, &req->r_osd->o_requests); + } + err = 1; /* osd changed */ + +out: + return err; +} + +/* + * caller should hold map_sem (for read) and request_mutex + */ +static int __send_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + struct ceph_osd_request_head *reqhead; + int err; + + err = __map_osds(osdc, req); + if (err < 0) + return err; + if (req->r_osd == NULL) { + dout("send_request %p no up osds in pg\n", req); + ceph_monc_request_next_osdmap(&osdc->client->monc); + return 0; + } + + dout("send_request %p tid %llu to osd%d flags %d\n", + req, req->r_tid, req->r_osd->o_osd, req->r_flags); + + reqhead = req->r_request->front.iov_base; + reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch); + reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */ + reqhead->reassert_version = req->r_reassert_version; + + req->r_sent_stamp = jiffies; + list_move_tail(&osdc->req_lru, &req->r_req_lru_item); + + ceph_msg_get(req->r_request); /* send consumes a ref */ + ceph_con_send(&req->r_osd->o_con, req->r_request); + req->r_sent = req->r_osd->o_incarnation; + return 0; +} + +/* + * Timeout callback, called every N seconds when 1 or more osd + * requests has been active for more than N seconds. When this + * happens, we ping all OSDs with requests who have timed out to + * ensure any communications channel reset is detected. Reset the + * request timeouts another N seconds in the future as we go. + * Reschedule the timeout event another N seconds in future (unless + * there are no open requests). + */ +static void handle_timeout(struct work_struct *work) +{ + struct ceph_osd_client *osdc = + container_of(work, struct ceph_osd_client, timeout_work.work); + struct ceph_osd_request *req, *last_req = NULL; + struct ceph_osd *osd; + unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ; + unsigned long keepalive = + osdc->client->mount_args->osd_keepalive_timeout * HZ; + unsigned long last_sent = 0; + struct rb_node *p; + struct list_head slow_osds; + + dout("timeout\n"); + down_read(&osdc->map_sem); + + ceph_monc_request_next_osdmap(&osdc->client->monc); + + mutex_lock(&osdc->request_mutex); + for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { + req = rb_entry(p, struct ceph_osd_request, r_node); + + if (req->r_resend) { + int err; + + dout("osdc resending prev failed %lld\n", req->r_tid); + err = __send_request(osdc, req); + if (err) + dout("osdc failed again on %lld\n", req->r_tid); + else + req->r_resend = false; + continue; + } + } + + /* + * reset osds that appear to be _really_ unresponsive. this + * is a failsafe measure.. we really shouldn't be getting to + * this point if the system is working properly. the monitors + * should mark the osd as failed and we should find out about + * it from an updated osd map. + */ + while (!list_empty(&osdc->req_lru)) { + req = list_entry(osdc->req_lru.next, struct ceph_osd_request, + r_req_lru_item); + + if (time_before(jiffies, req->r_sent_stamp + timeout)) + break; + + BUG_ON(req == last_req && req->r_sent_stamp == last_sent); + last_req = req; + last_sent = req->r_sent_stamp; + + osd = req->r_osd; + BUG_ON(!osd); + pr_warning(" tid %llu timed out on osd%d, will reset osd\n", + req->r_tid, osd->o_osd); + __kick_requests(osdc, osd); + } + + /* + * ping osds that are a bit slow. this ensures that if there + * is a break in the TCP connection we will notice, and reopen + * a connection with that osd (from the fault callback). + */ + INIT_LIST_HEAD(&slow_osds); + list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) { + if (time_before(jiffies, req->r_sent_stamp + keepalive)) + break; + + osd = req->r_osd; + BUG_ON(!osd); + dout(" tid %llu is slow, will send keepalive on osd%d\n", + req->r_tid, osd->o_osd); + list_move_tail(&osd->o_keepalive_item, &slow_osds); + } + while (!list_empty(&slow_osds)) { + osd = list_entry(slow_osds.next, struct ceph_osd, + o_keepalive_item); + list_del_init(&osd->o_keepalive_item); + ceph_con_keepalive(&osd->o_con); + } + + __schedule_osd_timeout(osdc); + mutex_unlock(&osdc->request_mutex); + + up_read(&osdc->map_sem); +} + +static void handle_osds_timeout(struct work_struct *work) +{ + struct ceph_osd_client *osdc = + container_of(work, struct ceph_osd_client, + osds_timeout_work.work); + unsigned long delay = + osdc->client->mount_args->osd_idle_ttl * HZ >> 2; + + dout("osds timeout\n"); + down_read(&osdc->map_sem); + remove_old_osds(osdc, 0); + up_read(&osdc->map_sem); + + schedule_delayed_work(&osdc->osds_timeout_work, + round_jiffies_relative(delay)); +} + +/* + * handle osd op reply. either call the callback if it is specified, + * or do the completion to wake up the waiting thread. + */ +static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, + struct ceph_connection *con) +{ + struct ceph_osd_reply_head *rhead = msg->front.iov_base; + struct ceph_osd_request *req; + u64 tid; + int numops, object_len, flags; + + tid = le64_to_cpu(msg->hdr.tid); + if (msg->front.iov_len < sizeof(*rhead)) + goto bad; + numops = le32_to_cpu(rhead->num_ops); + object_len = le32_to_cpu(rhead->object_len); + if (msg->front.iov_len != sizeof(*rhead) + object_len + + numops * sizeof(struct ceph_osd_op)) + goto bad; + dout("handle_reply %p tid %llu\n", msg, tid); + + /* lookup */ + mutex_lock(&osdc->request_mutex); + req = __lookup_request(osdc, tid); + if (req == NULL) { + dout("handle_reply tid %llu dne\n", tid); + mutex_unlock(&osdc->request_mutex); + return; + } + ceph_osdc_get_request(req); + flags = le32_to_cpu(rhead->flags); + + /* + * if this connection filled our message, drop our reference now, to + * avoid a (safe but slower) revoke later. + */ + if (req->r_con_filling_msg == con && req->r_reply == msg) { + dout(" dropping con_filling_msg ref %p\n", con); + req->r_con_filling_msg = NULL; + ceph_con_put(con); + } + + if (!req->r_got_reply) { + unsigned bytes; + + req->r_result = le32_to_cpu(rhead->result); + bytes = le32_to_cpu(msg->hdr.data_len); + dout("handle_reply result %d bytes %d\n", req->r_result, + bytes); + if (req->r_result == 0) + req->r_result = bytes; + + /* in case this is a write and we need to replay, */ + req->r_reassert_version = rhead->reassert_version; + + req->r_got_reply = 1; + } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { + dout("handle_reply tid %llu dup ack\n", tid); + mutex_unlock(&osdc->request_mutex); + goto done; + } + + dout("handle_reply tid %llu flags %d\n", tid, flags); + + /* either this is a read, or we got the safe response */ + if ((flags & CEPH_OSD_FLAG_ONDISK) || + ((flags & CEPH_OSD_FLAG_WRITE) == 0)) + __unregister_request(osdc, req); + + mutex_unlock(&osdc->request_mutex); + + if (req->r_callback) + req->r_callback(req, msg); + else + complete(&req->r_completion); + + if (flags & CEPH_OSD_FLAG_ONDISK) { + if (req->r_safe_callback) + req->r_safe_callback(req, msg); + complete(&req->r_safe_completion); /* fsync waiter */ + } + +done: + ceph_osdc_put_request(req); + return; + +bad: + pr_err("corrupt osd_op_reply got %d %d expected %d\n", + (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len), + (int)sizeof(*rhead)); + ceph_msg_dump(msg); +} + + +static int __kick_requests(struct ceph_osd_client *osdc, + struct ceph_osd *kickosd) +{ + struct ceph_osd_request *req; + struct rb_node *p, *n; + int needmap = 0; + int err; + + dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1); + if (kickosd) { + __reset_osd(osdc, kickosd); + } else { + for (p = rb_first(&osdc->osds); p; p = n) { + struct ceph_osd *osd = + rb_entry(p, struct ceph_osd, o_node); + + n = rb_next(p); + if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || + memcmp(&osd->o_con.peer_addr, + ceph_osd_addr(osdc->osdmap, + osd->o_osd), + sizeof(struct ceph_entity_addr)) != 0) + __reset_osd(osdc, osd); + } + } + + for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { + req = rb_entry(p, struct ceph_osd_request, r_node); + + if (req->r_resend) { + dout(" r_resend set on tid %llu\n", req->r_tid); + __cancel_request(req); + goto kick; + } + if (req->r_osd && kickosd == req->r_osd) { + __cancel_request(req); + goto kick; + } + + err = __map_osds(osdc, req); + if (err == 0) + continue; /* no change */ + if (err < 0) { + /* + * FIXME: really, we should set the request + * error and fail if this isn't a 'nofail' + * request, but that's a fair bit more + * complicated to do. So retry! + */ + dout(" setting r_resend on %llu\n", req->r_tid); + req->r_resend = true; + continue; + } + if (req->r_osd == NULL) { + dout("tid %llu maps to no valid osd\n", req->r_tid); + needmap++; /* request a newer map */ + continue; + } + +kick: + dout("kicking %p tid %llu osd%d\n", req, req->r_tid, + req->r_osd->o_osd); + req->r_flags |= CEPH_OSD_FLAG_RETRY; + err = __send_request(osdc, req); + if (err) { + dout(" setting r_resend on %llu\n", req->r_tid); + req->r_resend = true; + } + } + + return needmap; +} + +/* + * Resubmit osd requests whose osd or osd address has changed. Request + * a new osd map if osds are down, or we are otherwise unable to determine + * how to direct a request. + * + * Close connections to down osds. + * + * If @who is specified, resubmit requests for that specific osd. + * + * Caller should hold map_sem for read and request_mutex. + */ +static void kick_requests(struct ceph_osd_client *osdc, + struct ceph_osd *kickosd) +{ + int needmap; + + mutex_lock(&osdc->request_mutex); + needmap = __kick_requests(osdc, kickosd); + mutex_unlock(&osdc->request_mutex); + + if (needmap) { + dout("%d requests for down osds, need new map\n", needmap); + ceph_monc_request_next_osdmap(&osdc->client->monc); + } + +} +/* + * Process updated osd map. + * + * The message contains any number of incremental and full maps, normally + * indicating some sort of topology change in the cluster. Kick requests + * off to different OSDs as needed. + */ +void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) +{ + void *p, *end, *next; + u32 nr_maps, maplen; + u32 epoch; + struct ceph_osdmap *newmap = NULL, *oldmap; + int err; + struct ceph_fsid fsid; + + dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0); + p = msg->front.iov_base; + end = p + msg->front.iov_len; + + /* verify fsid */ + ceph_decode_need(&p, end, sizeof(fsid), bad); + ceph_decode_copy(&p, &fsid, sizeof(fsid)); + if (ceph_check_fsid(osdc->client, &fsid) < 0) + return; + + down_write(&osdc->map_sem); + + /* incremental maps */ + ceph_decode_32_safe(&p, end, nr_maps, bad); + dout(" %d inc maps\n", nr_maps); + while (nr_maps > 0) { + ceph_decode_need(&p, end, 2*sizeof(u32), bad); + epoch = ceph_decode_32(&p); + maplen = ceph_decode_32(&p); + ceph_decode_need(&p, end, maplen, bad); + next = p + maplen; + if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) { + dout("applying incremental map %u len %d\n", + epoch, maplen); + newmap = osdmap_apply_incremental(&p, next, + osdc->osdmap, + osdc->client->msgr); + if (IS_ERR(newmap)) { + err = PTR_ERR(newmap); + goto bad; + } + BUG_ON(!newmap); + if (newmap != osdc->osdmap) { + ceph_osdmap_destroy(osdc->osdmap); + osdc->osdmap = newmap; + } + } else { + dout("ignoring incremental map %u len %d\n", + epoch, maplen); + } + p = next; + nr_maps--; + } + if (newmap) + goto done; + + /* full maps */ + ceph_decode_32_safe(&p, end, nr_maps, bad); + dout(" %d full maps\n", nr_maps); + while (nr_maps) { + ceph_decode_need(&p, end, 2*sizeof(u32), bad); + epoch = ceph_decode_32(&p); + maplen = ceph_decode_32(&p); + ceph_decode_need(&p, end, maplen, bad); + if (nr_maps > 1) { + dout("skipping non-latest full map %u len %d\n", + epoch, maplen); + } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) { + dout("skipping full map %u len %d, " + "older than our %u\n", epoch, maplen, + osdc->osdmap->epoch); + } else { + dout("taking full map %u len %d\n", epoch, maplen); + newmap = osdmap_decode(&p, p+maplen); + if (IS_ERR(newmap)) { + err = PTR_ERR(newmap); + goto bad; + } + BUG_ON(!newmap); + oldmap = osdc->osdmap; + osdc->osdmap = newmap; + if (oldmap) + ceph_osdmap_destroy(oldmap); + } + p += maplen; + nr_maps--; + } + +done: + downgrade_write(&osdc->map_sem); + ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch); + if (newmap) + kick_requests(osdc, NULL); + up_read(&osdc->map_sem); + return; + +bad: + pr_err("osdc handle_map corrupt msg\n"); + ceph_msg_dump(msg); + up_write(&osdc->map_sem); + return; +} + + +/* + * A read request prepares specific pages that data is to be read into. + * When a message is being read off the wire, we call prepare_pages to + * find those pages. + * 0 = success, -1 failure. + */ +static int __prepare_pages(struct ceph_connection *con, + struct ceph_msg_header *hdr, + struct ceph_osd_request *req, + u64 tid, + struct ceph_msg *m) +{ + struct ceph_osd *osd = con->private; + struct ceph_osd_client *osdc; + int ret = -1; + int data_len = le32_to_cpu(hdr->data_len); + unsigned data_off = le16_to_cpu(hdr->data_off); + + int want = calc_pages_for(data_off & ~PAGE_MASK, data_len); + + if (!osd) + return -1; + + osdc = osd->o_osdc; + + dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m, + tid, req->r_num_pages, want); + if (unlikely(req->r_num_pages < want)) + goto out; + m->pages = req->r_pages; + m->nr_pages = req->r_num_pages; + ret = 0; /* success */ +out: + BUG_ON(ret < 0 || m->nr_pages < want); + + return ret; +} + +/* + * Register request, send initial attempt. + */ +int ceph_osdc_start_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req, + bool nofail) +{ + int rc = 0; + + req->r_request->pages = req->r_pages; + req->r_request->nr_pages = req->r_num_pages; + + register_request(osdc, req); + + down_read(&osdc->map_sem); + mutex_lock(&osdc->request_mutex); + /* + * a racing kick_requests() may have sent the message for us + * while we dropped request_mutex above, so only send now if + * the request still han't been touched yet. + */ + if (req->r_sent == 0) { + rc = __send_request(osdc, req); + if (rc) { + if (nofail) { + dout("osdc_start_request failed send, " + " marking %lld\n", req->r_tid); + req->r_resend = true; + rc = 0; + } else { + __unregister_request(osdc, req); + } + } + } + mutex_unlock(&osdc->request_mutex); + up_read(&osdc->map_sem); + return rc; +} + +/* + * wait for a request to complete + */ +int ceph_osdc_wait_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + int rc; + + rc = wait_for_completion_interruptible(&req->r_completion); + if (rc < 0) { + mutex_lock(&osdc->request_mutex); + __cancel_request(req); + __unregister_request(osdc, req); + mutex_unlock(&osdc->request_mutex); + dout("wait_request tid %llu canceled/timed out\n", req->r_tid); + return rc; + } + + dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result); + return req->r_result; +} + +/* + * sync - wait for all in-flight requests to flush. avoid starvation. + */ +void ceph_osdc_sync(struct ceph_osd_client *osdc) +{ + struct ceph_osd_request *req; + u64 last_tid, next_tid = 0; + + mutex_lock(&osdc->request_mutex); + last_tid = osdc->last_tid; + while (1) { + req = __lookup_request_ge(osdc, next_tid); + if (!req) + break; + if (req->r_tid > last_tid) + break; + + next_tid = req->r_tid + 1; + if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0) + continue; + + ceph_osdc_get_request(req); + mutex_unlock(&osdc->request_mutex); + dout("sync waiting on tid %llu (last is %llu)\n", + req->r_tid, last_tid); + wait_for_completion(&req->r_safe_completion); + mutex_lock(&osdc->request_mutex); + ceph_osdc_put_request(req); + } + mutex_unlock(&osdc->request_mutex); + dout("sync done (thru tid %llu)\n", last_tid); +} + +/* + * init, shutdown + */ +int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) +{ + int err; + + dout("init\n"); + osdc->client = client; + osdc->osdmap = NULL; + init_rwsem(&osdc->map_sem); + init_completion(&osdc->map_waiters); + osdc->last_requested_map = 0; + mutex_init(&osdc->request_mutex); + osdc->last_tid = 0; + osdc->osds = RB_ROOT; + INIT_LIST_HEAD(&osdc->osd_lru); + osdc->requests = RB_ROOT; + INIT_LIST_HEAD(&osdc->req_lru); + osdc->num_requests = 0; + INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); + INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); + + schedule_delayed_work(&osdc->osds_timeout_work, + round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ)); + + err = -ENOMEM; + osdc->req_mempool = mempool_create_kmalloc_pool(10, + sizeof(struct ceph_osd_request)); + if (!osdc->req_mempool) + goto out; + + err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true); + if (err < 0) + goto out_mempool; + err = ceph_msgpool_init(&osdc->msgpool_op_reply, + OSD_OPREPLY_FRONT_LEN, 10, true); + if (err < 0) + goto out_msgpool; + return 0; + +out_msgpool: + ceph_msgpool_destroy(&osdc->msgpool_op); +out_mempool: + mempool_destroy(osdc->req_mempool); +out: + return err; +} + +void ceph_osdc_stop(struct ceph_osd_client *osdc) +{ + cancel_delayed_work_sync(&osdc->timeout_work); + cancel_delayed_work_sync(&osdc->osds_timeout_work); + if (osdc->osdmap) { + ceph_osdmap_destroy(osdc->osdmap); + osdc->osdmap = NULL; + } + remove_old_osds(osdc, 1); + mempool_destroy(osdc->req_mempool); + ceph_msgpool_destroy(&osdc->msgpool_op); + ceph_msgpool_destroy(&osdc->msgpool_op_reply); +} + +/* + * Read some contiguous pages. If we cross a stripe boundary, shorten + * *plen. Return number of bytes read, or error. + */ +int ceph_osdc_readpages(struct ceph_osd_client *osdc, + struct ceph_vino vino, struct ceph_file_layout *layout, + u64 off, u64 *plen, + u32 truncate_seq, u64 truncate_size, + struct page **pages, int num_pages) +{ + struct ceph_osd_request *req; + int rc = 0; + + dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, + vino.snap, off, *plen); + req = ceph_osdc_new_request(osdc, layout, vino, off, plen, + CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, + NULL, 0, truncate_seq, truncate_size, NULL, + false, 1); + if (IS_ERR(req)) + return PTR_ERR(req); + + /* it may be a short read due to an object boundary */ + req->r_pages = pages; + num_pages = calc_pages_for(off, *plen); + req->r_num_pages = num_pages; + + dout("readpages final extent is %llu~%llu (%d pages)\n", + off, *plen, req->r_num_pages); + + rc = ceph_osdc_start_request(osdc, req, false); + if (!rc) + rc = ceph_osdc_wait_request(osdc, req); + + ceph_osdc_put_request(req); + dout("readpages result %d\n", rc); + return rc; +} + +/* + * do a synchronous write on N pages + */ +int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, + struct ceph_file_layout *layout, + struct ceph_snap_context *snapc, + u64 off, u64 len, + u32 truncate_seq, u64 truncate_size, + struct timespec *mtime, + struct page **pages, int num_pages, + int flags, int do_sync, bool nofail) +{ + struct ceph_osd_request *req; + int rc = 0; + + BUG_ON(vino.snap != CEPH_NOSNAP); + req = ceph_osdc_new_request(osdc, layout, vino, off, &len, + CEPH_OSD_OP_WRITE, + flags | CEPH_OSD_FLAG_ONDISK | + CEPH_OSD_FLAG_WRITE, + snapc, do_sync, + truncate_seq, truncate_size, mtime, + nofail, 1); + if (IS_ERR(req)) + return PTR_ERR(req); + + /* it may be a short write due to an object boundary */ + req->r_pages = pages; + req->r_num_pages = calc_pages_for(off, len); + dout("writepages %llu~%llu (%d pages)\n", off, len, + req->r_num_pages); + + rc = ceph_osdc_start_request(osdc, req, nofail); + if (!rc) + rc = ceph_osdc_wait_request(osdc, req); + + ceph_osdc_put_request(req); + if (rc == 0) + rc = len; + dout("writepages result %d\n", rc); + return rc; +} + +/* + * handle incoming message + */ +static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) +{ + struct ceph_osd *osd = con->private; + struct ceph_osd_client *osdc; + int type = le16_to_cpu(msg->hdr.type); + + if (!osd) + return; + osdc = osd->o_osdc; + + switch (type) { + case CEPH_MSG_OSD_MAP: + ceph_osdc_handle_map(osdc, msg); + break; + case CEPH_MSG_OSD_OPREPLY: + handle_reply(osdc, msg, con); + break; + + default: + pr_err("received unknown message type %d %s\n", type, + ceph_msg_type_name(type)); + } + ceph_msg_put(msg); +} + +/* + * lookup and return message for incoming reply + */ +static struct ceph_msg *get_reply(struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip) +{ + struct ceph_osd *osd = con->private; + struct ceph_osd_client *osdc = osd->o_osdc; + struct ceph_msg *m; + struct ceph_osd_request *req; + int front = le32_to_cpu(hdr->front_len); + int data_len = le32_to_cpu(hdr->data_len); + u64 tid; + int err; + + tid = le64_to_cpu(hdr->tid); + mutex_lock(&osdc->request_mutex); + req = __lookup_request(osdc, tid); + if (!req) { + *skip = 1; + m = NULL; + pr_info("get_reply unknown tid %llu from osd%d\n", tid, + osd->o_osd); + goto out; + } + + if (req->r_con_filling_msg) { + dout("get_reply revoking msg %p from old con %p\n", + req->r_reply, req->r_con_filling_msg); + ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); + ceph_con_put(req->r_con_filling_msg); + } + + if (front > req->r_reply->front.iov_len) { + pr_warning("get_reply front %d > preallocated %d\n", + front, (int)req->r_reply->front.iov_len); + m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL); + if (IS_ERR(m)) + goto out; + ceph_msg_put(req->r_reply); + req->r_reply = m; + } + m = ceph_msg_get(req->r_reply); + + if (data_len > 0) { + err = __prepare_pages(con, hdr, req, tid, m); + if (err < 0) { + *skip = 1; + ceph_msg_put(m); + m = ERR_PTR(err); + } + } + *skip = 0; + req->r_con_filling_msg = ceph_con_get(con); + dout("get_reply tid %lld %p\n", tid, m); + +out: + mutex_unlock(&osdc->request_mutex); + return m; + +} + +static struct ceph_msg *alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip) +{ + struct ceph_osd *osd = con->private; + int type = le16_to_cpu(hdr->type); + int front = le32_to_cpu(hdr->front_len); + + switch (type) { + case CEPH_MSG_OSD_MAP: + return ceph_msg_new(type, front, 0, 0, NULL); + case CEPH_MSG_OSD_OPREPLY: + return get_reply(con, hdr, skip); + default: + pr_info("alloc_msg unexpected msg type %d from osd%d\n", type, + osd->o_osd); + *skip = 1; + return NULL; + } +} + +/* + * Wrappers to refcount containing ceph_osd struct + */ +static struct ceph_connection *get_osd_con(struct ceph_connection *con) +{ + struct ceph_osd *osd = con->private; + if (get_osd(osd)) + return con; + return NULL; +} + +static void put_osd_con(struct ceph_connection *con) +{ + struct ceph_osd *osd = con->private; + put_osd(osd); +} + +/* + * authentication + */ +static int get_authorizer(struct ceph_connection *con, + void **buf, int *len, int *proto, + void **reply_buf, int *reply_len, int force_new) +{ + struct ceph_osd *o = con->private; + struct ceph_osd_client *osdc = o->o_osdc; + struct ceph_auth_client *ac = osdc->client->monc.auth; + int ret = 0; + + if (force_new && o->o_authorizer) { + ac->ops->destroy_authorizer(ac, o->o_authorizer); + o->o_authorizer = NULL; + } + if (o->o_authorizer == NULL) { + ret = ac->ops->create_authorizer( + ac, CEPH_ENTITY_TYPE_OSD, + &o->o_authorizer, + &o->o_authorizer_buf, + &o->o_authorizer_buf_len, + &o->o_authorizer_reply_buf, + &o->o_authorizer_reply_buf_len); + if (ret) + return ret; + } + + *proto = ac->protocol; + *buf = o->o_authorizer_buf; + *len = o->o_authorizer_buf_len; + *reply_buf = o->o_authorizer_reply_buf; + *reply_len = o->o_authorizer_reply_buf_len; + return 0; +} + + +static int verify_authorizer_reply(struct ceph_connection *con, int len) +{ + struct ceph_osd *o = con->private; + struct ceph_osd_client *osdc = o->o_osdc; + struct ceph_auth_client *ac = osdc->client->monc.auth; + + return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len); +} + +static int invalidate_authorizer(struct ceph_connection *con) +{ + struct ceph_osd *o = con->private; + struct ceph_osd_client *osdc = o->o_osdc; + struct ceph_auth_client *ac = osdc->client->monc.auth; + + if (ac->ops->invalidate_authorizer) + ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); + + return ceph_monc_validate_auth(&osdc->client->monc); +} + +const static struct ceph_connection_operations osd_con_ops = { + .get = get_osd_con, + .put = put_osd_con, + .dispatch = dispatch, + .get_authorizer = get_authorizer, + .verify_authorizer_reply = verify_authorizer_reply, + .invalidate_authorizer = invalidate_authorizer, + .alloc_msg = alloc_msg, + .fault = osd_reset, +}; diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h new file mode 100644 index 00000000000..1b1a3ca43af --- /dev/null +++ b/fs/ceph/osd_client.h @@ -0,0 +1,166 @@ +#ifndef _FS_CEPH_OSD_CLIENT_H +#define _FS_CEPH_OSD_CLIENT_H + +#include <linux/completion.h> +#include <linux/kref.h> +#include <linux/mempool.h> +#include <linux/rbtree.h> + +#include "types.h" +#include "osdmap.h" +#include "messenger.h" + +struct ceph_msg; +struct ceph_snap_context; +struct ceph_osd_request; +struct ceph_osd_client; +struct ceph_authorizer; + +/* + * completion callback for async writepages + */ +typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, + struct ceph_msg *); + +/* a given osd we're communicating with */ +struct ceph_osd { + atomic_t o_ref; + struct ceph_osd_client *o_osdc; + int o_osd; + int o_incarnation; + struct rb_node o_node; + struct ceph_connection o_con; + struct list_head o_requests; + struct list_head o_osd_lru; + struct ceph_authorizer *o_authorizer; + void *o_authorizer_buf, *o_authorizer_reply_buf; + size_t o_authorizer_buf_len, o_authorizer_reply_buf_len; + unsigned long lru_ttl; + int o_marked_for_keepalive; + struct list_head o_keepalive_item; +}; + +/* an in-flight request */ +struct ceph_osd_request { + u64 r_tid; /* unique for this client */ + struct rb_node r_node; + struct list_head r_req_lru_item; + struct list_head r_osd_item; + struct ceph_osd *r_osd; + struct ceph_pg r_pgid; + + struct ceph_connection *r_con_filling_msg; + + struct ceph_msg *r_request, *r_reply; + int r_result; + int r_flags; /* any additional flags for the osd */ + u32 r_sent; /* >0 if r_request is sending/sent */ + int r_got_reply; + + struct ceph_osd_client *r_osdc; + struct kref r_kref; + bool r_mempool; + struct completion r_completion, r_safe_completion; + ceph_osdc_callback_t r_callback, r_safe_callback; + struct ceph_eversion r_reassert_version; + struct list_head r_unsafe_item; + + struct inode *r_inode; /* for use by callbacks */ + struct writeback_control *r_wbc; /* ditto */ + + char r_oid[40]; /* object name */ + int r_oid_len; + unsigned long r_sent_stamp; + bool r_resend; /* msg send failed, needs retry */ + + struct ceph_file_layout r_file_layout; + struct ceph_snap_context *r_snapc; /* snap context for writes */ + unsigned r_num_pages; /* size of page array (follows) */ + struct page **r_pages; /* pages for data payload */ + int r_pages_from_pool; + int r_own_pages; /* if true, i own page list */ +}; + +struct ceph_osd_client { + struct ceph_client *client; + + struct ceph_osdmap *osdmap; /* current map */ + struct rw_semaphore map_sem; + struct completion map_waiters; + u64 last_requested_map; + + struct mutex request_mutex; + struct rb_root osds; /* osds */ + struct list_head osd_lru; /* idle osds */ + u64 timeout_tid; /* tid of timeout triggering rq */ + u64 last_tid; /* tid of last request */ + struct rb_root requests; /* pending requests */ + struct list_head req_lru; /* pending requests lru */ + int num_requests; + struct delayed_work timeout_work; + struct delayed_work osds_timeout_work; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_file; +#endif + + mempool_t *req_mempool; + + struct ceph_msgpool msgpool_op; + struct ceph_msgpool msgpool_op_reply; +}; + +extern int ceph_osdc_init(struct ceph_osd_client *osdc, + struct ceph_client *client); +extern void ceph_osdc_stop(struct ceph_osd_client *osdc); + +extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, + struct ceph_msg *msg); +extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, + struct ceph_msg *msg); + +extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, + struct ceph_file_layout *layout, + struct ceph_vino vino, + u64 offset, u64 *len, int op, int flags, + struct ceph_snap_context *snapc, + int do_sync, u32 truncate_seq, + u64 truncate_size, + struct timespec *mtime, + bool use_mempool, int num_reply); + +static inline void ceph_osdc_get_request(struct ceph_osd_request *req) +{ + kref_get(&req->r_kref); +} +extern void ceph_osdc_release_request(struct kref *kref); +static inline void ceph_osdc_put_request(struct ceph_osd_request *req) +{ + kref_put(&req->r_kref, ceph_osdc_release_request); +} + +extern int ceph_osdc_start_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req, + bool nofail); +extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); +extern void ceph_osdc_sync(struct ceph_osd_client *osdc); + +extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, + struct ceph_vino vino, + struct ceph_file_layout *layout, + u64 off, u64 *plen, + u32 truncate_seq, u64 truncate_size, + struct page **pages, int nr_pages); + +extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, + struct ceph_vino vino, + struct ceph_file_layout *layout, + struct ceph_snap_context *sc, + u64 off, u64 len, + u32 truncate_seq, u64 truncate_size, + struct timespec *mtime, + struct page **pages, int nr_pages, + int flags, int do_sync, bool nofail); + +#endif + diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c new file mode 100644 index 00000000000..b83f2692b83 --- /dev/null +++ b/fs/ceph/osdmap.c @@ -0,0 +1,1019 @@ + +#include <asm/div64.h> + +#include "super.h" +#include "osdmap.h" +#include "crush/hash.h" +#include "crush/mapper.h" +#include "decode.h" +#include "ceph_debug.h" + +char *ceph_osdmap_state_str(char *str, int len, int state) +{ + int flag = 0; + + if (!len) + goto done; + + *str = '\0'; + if (state) { + if (state & CEPH_OSD_EXISTS) { + snprintf(str, len, "exists"); + flag = 1; + } + if (state & CEPH_OSD_UP) { + snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""), + "up"); + flag = 1; + } + } else { + snprintf(str, len, "doesn't exist"); + } +done: + return str; +} + +/* maps */ + +static int calc_bits_of(unsigned t) +{ + int b = 0; + while (t) { + t = t >> 1; + b++; + } + return b; +} + +/* + * the foo_mask is the smallest value 2^n-1 that is >= foo. + */ +static void calc_pg_masks(struct ceph_pg_pool_info *pi) +{ + pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1; + pi->pgp_num_mask = + (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1; + pi->lpg_num_mask = + (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1; + pi->lpgp_num_mask = + (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1; +} + +/* + * decode crush map + */ +static int crush_decode_uniform_bucket(void **p, void *end, + struct crush_bucket_uniform *b) +{ + dout("crush_decode_uniform_bucket %p to %p\n", *p, end); + ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad); + b->item_weight = ceph_decode_32(p); + return 0; +bad: + return -EINVAL; +} + +static int crush_decode_list_bucket(void **p, void *end, + struct crush_bucket_list *b) +{ + int j; + dout("crush_decode_list_bucket %p to %p\n", *p, end); + b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); + if (b->item_weights == NULL) + return -ENOMEM; + b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); + if (b->sum_weights == NULL) + return -ENOMEM; + ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad); + for (j = 0; j < b->h.size; j++) { + b->item_weights[j] = ceph_decode_32(p); + b->sum_weights[j] = ceph_decode_32(p); + } + return 0; +bad: + return -EINVAL; +} + +static int crush_decode_tree_bucket(void **p, void *end, + struct crush_bucket_tree *b) +{ + int j; + dout("crush_decode_tree_bucket %p to %p\n", *p, end); + ceph_decode_32_safe(p, end, b->num_nodes, bad); + b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS); + if (b->node_weights == NULL) + return -ENOMEM; + ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad); + for (j = 0; j < b->num_nodes; j++) + b->node_weights[j] = ceph_decode_32(p); + return 0; +bad: + return -EINVAL; +} + +static int crush_decode_straw_bucket(void **p, void *end, + struct crush_bucket_straw *b) +{ + int j; + dout("crush_decode_straw_bucket %p to %p\n", *p, end); + b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); + if (b->item_weights == NULL) + return -ENOMEM; + b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); + if (b->straws == NULL) + return -ENOMEM; + ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad); + for (j = 0; j < b->h.size; j++) { + b->item_weights[j] = ceph_decode_32(p); + b->straws[j] = ceph_decode_32(p); + } + return 0; +bad: + return -EINVAL; +} + +static struct crush_map *crush_decode(void *pbyval, void *end) +{ + struct crush_map *c; + int err = -EINVAL; + int i, j; + void **p = &pbyval; + void *start = pbyval; + u32 magic; + + dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); + + c = kzalloc(sizeof(*c), GFP_NOFS); + if (c == NULL) + return ERR_PTR(-ENOMEM); + + ceph_decode_need(p, end, 4*sizeof(u32), bad); + magic = ceph_decode_32(p); + if (magic != CRUSH_MAGIC) { + pr_err("crush_decode magic %x != current %x\n", + (unsigned)magic, (unsigned)CRUSH_MAGIC); + goto bad; + } + c->max_buckets = ceph_decode_32(p); + c->max_rules = ceph_decode_32(p); + c->max_devices = ceph_decode_32(p); + + c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS); + if (c->device_parents == NULL) + goto badmem; + c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS); + if (c->bucket_parents == NULL) + goto badmem; + + c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS); + if (c->buckets == NULL) + goto badmem; + c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS); + if (c->rules == NULL) + goto badmem; + + /* buckets */ + for (i = 0; i < c->max_buckets; i++) { + int size = 0; + u32 alg; + struct crush_bucket *b; + + ceph_decode_32_safe(p, end, alg, bad); + if (alg == 0) { + c->buckets[i] = NULL; + continue; + } + dout("crush_decode bucket %d off %x %p to %p\n", + i, (int)(*p-start), *p, end); + + switch (alg) { + case CRUSH_BUCKET_UNIFORM: + size = sizeof(struct crush_bucket_uniform); + break; + case CRUSH_BUCKET_LIST: + size = sizeof(struct crush_bucket_list); + break; + case CRUSH_BUCKET_TREE: + size = sizeof(struct crush_bucket_tree); + break; + case CRUSH_BUCKET_STRAW: + size = sizeof(struct crush_bucket_straw); + break; + default: + err = -EINVAL; + goto bad; + } + BUG_ON(size == 0); + b = c->buckets[i] = kzalloc(size, GFP_NOFS); + if (b == NULL) + goto badmem; + + ceph_decode_need(p, end, 4*sizeof(u32), bad); + b->id = ceph_decode_32(p); + b->type = ceph_decode_16(p); + b->alg = ceph_decode_8(p); + b->hash = ceph_decode_8(p); + b->weight = ceph_decode_32(p); + b->size = ceph_decode_32(p); + + dout("crush_decode bucket size %d off %x %p to %p\n", + b->size, (int)(*p-start), *p, end); + + b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS); + if (b->items == NULL) + goto badmem; + b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS); + if (b->perm == NULL) + goto badmem; + b->perm_n = 0; + + ceph_decode_need(p, end, b->size*sizeof(u32), bad); + for (j = 0; j < b->size; j++) + b->items[j] = ceph_decode_32(p); + + switch (b->alg) { + case CRUSH_BUCKET_UNIFORM: + err = crush_decode_uniform_bucket(p, end, + (struct crush_bucket_uniform *)b); + if (err < 0) + goto bad; + break; + case CRUSH_BUCKET_LIST: + err = crush_decode_list_bucket(p, end, + (struct crush_bucket_list *)b); + if (err < 0) + goto bad; + break; + case CRUSH_BUCKET_TREE: + err = crush_decode_tree_bucket(p, end, + (struct crush_bucket_tree *)b); + if (err < 0) + goto bad; + break; + case CRUSH_BUCKET_STRAW: + err = crush_decode_straw_bucket(p, end, + (struct crush_bucket_straw *)b); + if (err < 0) + goto bad; + break; + } + } + + /* rules */ + dout("rule vec is %p\n", c->rules); + for (i = 0; i < c->max_rules; i++) { + u32 yes; + struct crush_rule *r; + + ceph_decode_32_safe(p, end, yes, bad); + if (!yes) { + dout("crush_decode NO rule %d off %x %p to %p\n", + i, (int)(*p-start), *p, end); + c->rules[i] = NULL; + continue; + } + + dout("crush_decode rule %d off %x %p to %p\n", + i, (int)(*p-start), *p, end); + + /* len */ + ceph_decode_32_safe(p, end, yes, bad); +#if BITS_PER_LONG == 32 + err = -EINVAL; + if (yes > ULONG_MAX / sizeof(struct crush_rule_step)) + goto bad; +#endif + r = c->rules[i] = kmalloc(sizeof(*r) + + yes*sizeof(struct crush_rule_step), + GFP_NOFS); + if (r == NULL) + goto badmem; + dout(" rule %d is at %p\n", i, r); + r->len = yes; + ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */ + ceph_decode_need(p, end, r->len*3*sizeof(u32), bad); + for (j = 0; j < r->len; j++) { + r->steps[j].op = ceph_decode_32(p); + r->steps[j].arg1 = ceph_decode_32(p); + r->steps[j].arg2 = ceph_decode_32(p); + } + } + + /* ignore trailing name maps. */ + + dout("crush_decode success\n"); + return c; + +badmem: + err = -ENOMEM; +bad: + dout("crush_decode fail %d\n", err); + crush_destroy(c); + return ERR_PTR(err); +} + + +/* + * osd map + */ +void ceph_osdmap_destroy(struct ceph_osdmap *map) +{ + dout("osdmap_destroy %p\n", map); + if (map->crush) + crush_destroy(map->crush); + while (!RB_EMPTY_ROOT(&map->pg_temp)) { + struct ceph_pg_mapping *pg = + rb_entry(rb_first(&map->pg_temp), + struct ceph_pg_mapping, node); + rb_erase(&pg->node, &map->pg_temp); + kfree(pg); + } + while (!RB_EMPTY_ROOT(&map->pg_pools)) { + struct ceph_pg_pool_info *pi = + rb_entry(rb_first(&map->pg_pools), + struct ceph_pg_pool_info, node); + rb_erase(&pi->node, &map->pg_pools); + kfree(pi); + } + kfree(map->osd_state); + kfree(map->osd_weight); + kfree(map->osd_addr); + kfree(map); +} + +/* + * adjust max osd value. reallocate arrays. + */ +static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) +{ + u8 *state; + struct ceph_entity_addr *addr; + u32 *weight; + + state = kcalloc(max, sizeof(*state), GFP_NOFS); + addr = kcalloc(max, sizeof(*addr), GFP_NOFS); + weight = kcalloc(max, sizeof(*weight), GFP_NOFS); + if (state == NULL || addr == NULL || weight == NULL) { + kfree(state); + kfree(addr); + kfree(weight); + return -ENOMEM; + } + + /* copy old? */ + if (map->osd_state) { + memcpy(state, map->osd_state, map->max_osd*sizeof(*state)); + memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr)); + memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight)); + kfree(map->osd_state); + kfree(map->osd_addr); + kfree(map->osd_weight); + } + + map->osd_state = state; + map->osd_weight = weight; + map->osd_addr = addr; + map->max_osd = max; + return 0; +} + +/* + * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid + * to a set of osds) + */ +static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) +{ + u64 a = *(u64 *)&l; + u64 b = *(u64 *)&r; + + if (a < b) + return -1; + if (a > b) + return 1; + return 0; +} + +static int __insert_pg_mapping(struct ceph_pg_mapping *new, + struct rb_root *root) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct ceph_pg_mapping *pg = NULL; + int c; + + while (*p) { + parent = *p; + pg = rb_entry(parent, struct ceph_pg_mapping, node); + c = pgid_cmp(new->pgid, pg->pgid); + if (c < 0) + p = &(*p)->rb_left; + else if (c > 0) + p = &(*p)->rb_right; + else + return -EEXIST; + } + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, root); + return 0; +} + +static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, + struct ceph_pg pgid) +{ + struct rb_node *n = root->rb_node; + struct ceph_pg_mapping *pg; + int c; + + while (n) { + pg = rb_entry(n, struct ceph_pg_mapping, node); + c = pgid_cmp(pgid, pg->pgid); + if (c < 0) + n = n->rb_left; + else if (c > 0) + n = n->rb_right; + else + return pg; + } + return NULL; +} + +/* + * rbtree of pg pool info + */ +static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct ceph_pg_pool_info *pi = NULL; + + while (*p) { + parent = *p; + pi = rb_entry(parent, struct ceph_pg_pool_info, node); + if (new->id < pi->id) + p = &(*p)->rb_left; + else if (new->id > pi->id) + p = &(*p)->rb_right; + else + return -EEXIST; + } + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, root); + return 0; +} + +static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) +{ + struct ceph_pg_pool_info *pi; + struct rb_node *n = root->rb_node; + + while (n) { + pi = rb_entry(n, struct ceph_pg_pool_info, node); + if (id < pi->id) + n = n->rb_left; + else if (id > pi->id) + n = n->rb_right; + else + return pi; + } + return NULL; +} + +/* + * decode a full map. + */ +struct ceph_osdmap *osdmap_decode(void **p, void *end) +{ + struct ceph_osdmap *map; + u16 version; + u32 len, max, i; + u8 ev; + int err = -EINVAL; + void *start = *p; + struct ceph_pg_pool_info *pi; + + dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p)); + + map = kzalloc(sizeof(*map), GFP_NOFS); + if (map == NULL) + return ERR_PTR(-ENOMEM); + map->pg_temp = RB_ROOT; + + ceph_decode_16_safe(p, end, version, bad); + if (version > CEPH_OSDMAP_VERSION) { + pr_warning("got unknown v %d > %d of osdmap\n", version, + CEPH_OSDMAP_VERSION); + goto bad; + } + + ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad); + ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); + map->epoch = ceph_decode_32(p); + ceph_decode_copy(p, &map->created, sizeof(map->created)); + ceph_decode_copy(p, &map->modified, sizeof(map->modified)); + + ceph_decode_32_safe(p, end, max, bad); + while (max--) { + ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); + pi = kmalloc(sizeof(*pi), GFP_NOFS); + if (!pi) + goto bad; + pi->id = ceph_decode_32(p); + ev = ceph_decode_8(p); /* encoding version */ + if (ev > CEPH_PG_POOL_VERSION) { + pr_warning("got unknown v %d > %d of ceph_pg_pool\n", + ev, CEPH_PG_POOL_VERSION); + goto bad; + } + ceph_decode_copy(p, &pi->v, sizeof(pi->v)); + __insert_pg_pool(&map->pg_pools, pi); + calc_pg_masks(pi); + *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64); + *p += le32_to_cpu(pi->v.num_removed_snap_intervals) + * sizeof(u64) * 2; + } + ceph_decode_32_safe(p, end, map->pool_max, bad); + + ceph_decode_32_safe(p, end, map->flags, bad); + + max = ceph_decode_32(p); + + /* (re)alloc osd arrays */ + err = osdmap_set_max_osd(map, max); + if (err < 0) + goto bad; + dout("osdmap_decode max_osd = %d\n", map->max_osd); + + /* osds */ + err = -EINVAL; + ceph_decode_need(p, end, 3*sizeof(u32) + + map->max_osd*(1 + sizeof(*map->osd_weight) + + sizeof(*map->osd_addr)), bad); + *p += 4; /* skip length field (should match max) */ + ceph_decode_copy(p, map->osd_state, map->max_osd); + + *p += 4; /* skip length field (should match max) */ + for (i = 0; i < map->max_osd; i++) + map->osd_weight[i] = ceph_decode_32(p); + + *p += 4; /* skip length field (should match max) */ + ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr)); + for (i = 0; i < map->max_osd; i++) + ceph_decode_addr(&map->osd_addr[i]); + + /* pg_temp */ + ceph_decode_32_safe(p, end, len, bad); + for (i = 0; i < len; i++) { + int n, j; + struct ceph_pg pgid; + struct ceph_pg_mapping *pg; + + ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad); + ceph_decode_copy(p, &pgid, sizeof(pgid)); + n = ceph_decode_32(p); + ceph_decode_need(p, end, n * sizeof(u32), bad); + err = -ENOMEM; + pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS); + if (!pg) + goto bad; + pg->pgid = pgid; + pg->len = n; + for (j = 0; j < n; j++) + pg->osds[j] = ceph_decode_32(p); + + err = __insert_pg_mapping(pg, &map->pg_temp); + if (err) + goto bad; + dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len); + } + + /* crush */ + ceph_decode_32_safe(p, end, len, bad); + dout("osdmap_decode crush len %d from off 0x%x\n", len, + (int)(*p - start)); + ceph_decode_need(p, end, len, bad); + map->crush = crush_decode(*p, end); + *p += len; + if (IS_ERR(map->crush)) { + err = PTR_ERR(map->crush); + map->crush = NULL; + goto bad; + } + + /* ignore the rest of the map */ + *p = end; + + dout("osdmap_decode done %p %p\n", *p, end); + return map; + +bad: + dout("osdmap_decode fail\n"); + ceph_osdmap_destroy(map); + return ERR_PTR(err); +} + +/* + * decode and apply an incremental map update. + */ +struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, + struct ceph_osdmap *map, + struct ceph_messenger *msgr) +{ + struct crush_map *newcrush = NULL; + struct ceph_fsid fsid; + u32 epoch = 0; + struct ceph_timespec modified; + u32 len, pool; + __s32 new_pool_max, new_flags, max; + void *start = *p; + int err = -EINVAL; + u16 version; + struct rb_node *rbp; + + ceph_decode_16_safe(p, end, version, bad); + if (version > CEPH_OSDMAP_INC_VERSION) { + pr_warning("got unknown v %d > %d of inc osdmap\n", version, + CEPH_OSDMAP_INC_VERSION); + goto bad; + } + + ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32), + bad); + ceph_decode_copy(p, &fsid, sizeof(fsid)); + epoch = ceph_decode_32(p); + BUG_ON(epoch != map->epoch+1); + ceph_decode_copy(p, &modified, sizeof(modified)); + new_pool_max = ceph_decode_32(p); + new_flags = ceph_decode_32(p); + + /* full map? */ + ceph_decode_32_safe(p, end, len, bad); + if (len > 0) { + dout("apply_incremental full map len %d, %p to %p\n", + len, *p, end); + return osdmap_decode(p, min(*p+len, end)); + } + + /* new crush? */ + ceph_decode_32_safe(p, end, len, bad); + if (len > 0) { + dout("apply_incremental new crush map len %d, %p to %p\n", + len, *p, end); + newcrush = crush_decode(*p, min(*p+len, end)); + if (IS_ERR(newcrush)) + return ERR_PTR(PTR_ERR(newcrush)); + } + + /* new flags? */ + if (new_flags >= 0) + map->flags = new_flags; + if (new_pool_max >= 0) + map->pool_max = new_pool_max; + + ceph_decode_need(p, end, 5*sizeof(u32), bad); + + /* new max? */ + max = ceph_decode_32(p); + if (max >= 0) { + err = osdmap_set_max_osd(map, max); + if (err < 0) + goto bad; + } + + map->epoch++; + map->modified = map->modified; + if (newcrush) { + if (map->crush) + crush_destroy(map->crush); + map->crush = newcrush; + newcrush = NULL; + } + + /* new_pool */ + ceph_decode_32_safe(p, end, len, bad); + while (len--) { + __u8 ev; + struct ceph_pg_pool_info *pi; + + ceph_decode_32_safe(p, end, pool, bad); + ceph_decode_need(p, end, 1 + sizeof(pi->v), bad); + ev = ceph_decode_8(p); /* encoding version */ + if (ev > CEPH_PG_POOL_VERSION) { + pr_warning("got unknown v %d > %d of ceph_pg_pool\n", + ev, CEPH_PG_POOL_VERSION); + goto bad; + } + pi = __lookup_pg_pool(&map->pg_pools, pool); + if (!pi) { + pi = kmalloc(sizeof(*pi), GFP_NOFS); + if (!pi) { + err = -ENOMEM; + goto bad; + } + pi->id = pool; + __insert_pg_pool(&map->pg_pools, pi); + } + ceph_decode_copy(p, &pi->v, sizeof(pi->v)); + calc_pg_masks(pi); + } + + /* old_pool */ + ceph_decode_32_safe(p, end, len, bad); + while (len--) { + struct ceph_pg_pool_info *pi; + + ceph_decode_32_safe(p, end, pool, bad); + pi = __lookup_pg_pool(&map->pg_pools, pool); + if (pi) { + rb_erase(&pi->node, &map->pg_pools); + kfree(pi); + } + } + + /* new_up */ + err = -EINVAL; + ceph_decode_32_safe(p, end, len, bad); + while (len--) { + u32 osd; + struct ceph_entity_addr addr; + ceph_decode_32_safe(p, end, osd, bad); + ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad); + ceph_decode_addr(&addr); + pr_info("osd%d up\n", osd); + BUG_ON(osd >= map->max_osd); + map->osd_state[osd] |= CEPH_OSD_UP; + map->osd_addr[osd] = addr; + } + + /* new_down */ + ceph_decode_32_safe(p, end, len, bad); + while (len--) { + u32 osd; + ceph_decode_32_safe(p, end, osd, bad); + (*p)++; /* clean flag */ + pr_info("osd%d down\n", osd); + if (osd < map->max_osd) + map->osd_state[osd] &= ~CEPH_OSD_UP; + } + + /* new_weight */ + ceph_decode_32_safe(p, end, len, bad); + while (len--) { + u32 osd, off; + ceph_decode_need(p, end, sizeof(u32)*2, bad); + osd = ceph_decode_32(p); + off = ceph_decode_32(p); + pr_info("osd%d weight 0x%x %s\n", osd, off, + off == CEPH_OSD_IN ? "(in)" : + (off == CEPH_OSD_OUT ? "(out)" : "")); + if (osd < map->max_osd) + map->osd_weight[osd] = off; + } + + /* new_pg_temp */ + rbp = rb_first(&map->pg_temp); + ceph_decode_32_safe(p, end, len, bad); + while (len--) { + struct ceph_pg_mapping *pg; + int j; + struct ceph_pg pgid; + u32 pglen; + ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); + ceph_decode_copy(p, &pgid, sizeof(pgid)); + pglen = ceph_decode_32(p); + + /* remove any? */ + while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping, + node)->pgid, pgid) <= 0) { + struct rb_node *cur = rbp; + rbp = rb_next(rbp); + dout(" removed pg_temp %llx\n", + *(u64 *)&rb_entry(cur, struct ceph_pg_mapping, + node)->pgid); + rb_erase(cur, &map->pg_temp); + } + + if (pglen) { + /* insert */ + ceph_decode_need(p, end, pglen*sizeof(u32), bad); + pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS); + if (!pg) { + err = -ENOMEM; + goto bad; + } + pg->pgid = pgid; + pg->len = pglen; + for (j = 0; j < pglen; j++) + pg->osds[j] = ceph_decode_32(p); + err = __insert_pg_mapping(pg, &map->pg_temp); + if (err) + goto bad; + dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, + pglen); + } + } + while (rbp) { + struct rb_node *cur = rbp; + rbp = rb_next(rbp); + dout(" removed pg_temp %llx\n", + *(u64 *)&rb_entry(cur, struct ceph_pg_mapping, + node)->pgid); + rb_erase(cur, &map->pg_temp); + } + + /* ignore the rest */ + *p = end; + return map; + +bad: + pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n", + epoch, (int)(*p - start), *p, start, end); + print_hex_dump(KERN_DEBUG, "osdmap: ", + DUMP_PREFIX_OFFSET, 16, 1, + start, end - start, true); + if (newcrush) + crush_destroy(newcrush); + return ERR_PTR(err); +} + + + + +/* + * calculate file layout from given offset, length. + * fill in correct oid, logical length, and object extent + * offset, length. + * + * for now, we write only a single su, until we can + * pass a stride back to the caller. + */ +void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, + u64 off, u64 *plen, + u64 *ono, + u64 *oxoff, u64 *oxlen) +{ + u32 osize = le32_to_cpu(layout->fl_object_size); + u32 su = le32_to_cpu(layout->fl_stripe_unit); + u32 sc = le32_to_cpu(layout->fl_stripe_count); + u32 bl, stripeno, stripepos, objsetno; + u32 su_per_object; + u64 t, su_offset; + + dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, + osize, su); + su_per_object = osize / su; + dout("osize %u / su %u = su_per_object %u\n", osize, su, + su_per_object); + + BUG_ON((su & ~PAGE_MASK) != 0); + /* bl = *off / su; */ + t = off; + do_div(t, su); + bl = t; + dout("off %llu / su %u = bl %u\n", off, su, bl); + + stripeno = bl / sc; + stripepos = bl % sc; + objsetno = stripeno / su_per_object; + + *ono = objsetno * sc + stripepos; + dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono); + + /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */ + t = off; + su_offset = do_div(t, su); + *oxoff = su_offset + (stripeno % su_per_object) * su; + + /* + * Calculate the length of the extent being written to the selected + * object. This is the minimum of the full length requested (plen) or + * the remainder of the current stripe being written to. + */ + *oxlen = min_t(u64, *plen, su - su_offset); + *plen = *oxlen; + + dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); +} + +/* + * calculate an object layout (i.e. pgid) from an oid, + * file_layout, and osdmap + */ +int ceph_calc_object_layout(struct ceph_object_layout *ol, + const char *oid, + struct ceph_file_layout *fl, + struct ceph_osdmap *osdmap) +{ + unsigned num, num_mask; + struct ceph_pg pgid; + s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred); + int poolid = le32_to_cpu(fl->fl_pg_pool); + struct ceph_pg_pool_info *pool; + unsigned ps; + + BUG_ON(!osdmap); + + pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); + if (!pool) + return -EIO; + ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid)); + if (preferred >= 0) { + ps += preferred; + num = le32_to_cpu(pool->v.lpg_num); + num_mask = pool->lpg_num_mask; + } else { + num = le32_to_cpu(pool->v.pg_num); + num_mask = pool->pg_num_mask; + } + + pgid.ps = cpu_to_le16(ps); + pgid.preferred = cpu_to_le16(preferred); + pgid.pool = fl->fl_pg_pool; + if (preferred >= 0) + dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps, + (int)preferred); + else + dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps); + + ol->ol_pgid = pgid; + ol->ol_stripe_unit = fl->fl_object_stripe_unit; + return 0; +} + +/* + * Calculate raw osd vector for the given pgid. Return pointer to osd + * array, or NULL on failure. + */ +static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, + int *osds, int *num) +{ + struct ceph_pg_mapping *pg; + struct ceph_pg_pool_info *pool; + int ruleno; + unsigned poolid, ps, pps; + int preferred; + + /* pg_temp? */ + pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); + if (pg) { + *num = pg->len; + return pg->osds; + } + + /* crush */ + poolid = le32_to_cpu(pgid.pool); + ps = le16_to_cpu(pgid.ps); + preferred = (s16)le16_to_cpu(pgid.preferred); + + /* don't forcefeed bad device ids to crush */ + if (preferred >= osdmap->max_osd || + preferred >= osdmap->crush->max_devices) + preferred = -1; + + pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); + if (!pool) + return NULL; + ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, + pool->v.type, pool->v.size); + if (ruleno < 0) { + pr_err("no crush rule pool %d type %d size %d\n", + poolid, pool->v.type, pool->v.size); + return NULL; + } + + if (preferred >= 0) + pps = ceph_stable_mod(ps, + le32_to_cpu(pool->v.lpgp_num), + pool->lpgp_num_mask); + else + pps = ceph_stable_mod(ps, + le32_to_cpu(pool->v.pgp_num), + pool->pgp_num_mask); + pps += poolid; + *num = crush_do_rule(osdmap->crush, ruleno, pps, osds, + min_t(int, pool->v.size, *num), + preferred, osdmap->osd_weight); + return osds; +} + +/* + * Return primary osd for given pgid, or -1 if none. + */ +int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) +{ + int rawosds[10], *osds; + int i, num = ARRAY_SIZE(rawosds); + + osds = calc_pg_raw(osdmap, pgid, rawosds, &num); + if (!osds) + return -1; + + /* primary is first up osd */ + for (i = 0; i < num; i++) + if (ceph_osd_is_up(osdmap, osds[i])) { + return osds[i]; + break; + } + return -1; +} diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h new file mode 100644 index 00000000000..1fb55afb264 --- /dev/null +++ b/fs/ceph/osdmap.h @@ -0,0 +1,125 @@ +#ifndef _FS_CEPH_OSDMAP_H +#define _FS_CEPH_OSDMAP_H + +#include <linux/rbtree.h> +#include "types.h" +#include "ceph_fs.h" +#include "crush/crush.h" + +/* + * The osd map describes the current membership of the osd cluster and + * specifies the mapping of objects to placement groups and placement + * groups to (sets of) osds. That is, it completely specifies the + * (desired) distribution of all data objects in the system at some + * point in time. + * + * Each map version is identified by an epoch, which increases monotonically. + * + * The map can be updated either via an incremental map (diff) describing + * the change between two successive epochs, or as a fully encoded map. + */ +struct ceph_pg_pool_info { + struct rb_node node; + int id; + struct ceph_pg_pool v; + int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; +}; + +struct ceph_pg_mapping { + struct rb_node node; + struct ceph_pg pgid; + int len; + int osds[]; +}; + +struct ceph_osdmap { + struct ceph_fsid fsid; + u32 epoch; + u32 mkfs_epoch; + struct ceph_timespec created, modified; + + u32 flags; /* CEPH_OSDMAP_* */ + + u32 max_osd; /* size of osd_state, _offload, _addr arrays */ + u8 *osd_state; /* CEPH_OSD_* */ + u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */ + struct ceph_entity_addr *osd_addr; + + struct rb_root pg_temp; + struct rb_root pg_pools; + u32 pool_max; + + /* the CRUSH map specifies the mapping of placement groups to + * the list of osds that store+replicate them. */ + struct crush_map *crush; +}; + +/* + * file layout helpers + */ +#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit)) +#define ceph_file_layout_stripe_count(l) \ + ((__s32)le32_to_cpu((l).fl_stripe_count)) +#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size)) +#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash)) +#define ceph_file_layout_object_su(l) \ + ((__s32)le32_to_cpu((l).fl_object_stripe_unit)) +#define ceph_file_layout_pg_preferred(l) \ + ((__s32)le32_to_cpu((l).fl_pg_preferred)) +#define ceph_file_layout_pg_pool(l) \ + ((__s32)le32_to_cpu((l).fl_pg_pool)) + +static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l) +{ + return le32_to_cpu(l->fl_stripe_unit) * + le32_to_cpu(l->fl_stripe_count); +} + +/* "period" == bytes before i start on a new set of objects */ +static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l) +{ + return le32_to_cpu(l->fl_object_size) * + le32_to_cpu(l->fl_stripe_count); +} + + +static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) +{ + return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP); +} + +static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag) +{ + return map && (map->flags & flag); +} + +extern char *ceph_osdmap_state_str(char *str, int len, int state); + +static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map, + int osd) +{ + if (osd >= map->max_osd) + return NULL; + return &map->osd_addr[osd]; +} + +extern struct ceph_osdmap *osdmap_decode(void **p, void *end); +extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, + struct ceph_osdmap *map, + struct ceph_messenger *msgr); +extern void ceph_osdmap_destroy(struct ceph_osdmap *map); + +/* calculate mapping of a file extent to an object */ +extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, + u64 off, u64 *plen, + u64 *bno, u64 *oxoff, u64 *oxlen); + +/* calculate mapping of object to a placement group */ +extern int ceph_calc_object_layout(struct ceph_object_layout *ol, + const char *oid, + struct ceph_file_layout *fl, + struct ceph_osdmap *osdmap); +extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, + struct ceph_pg pgid); + +#endif diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c new file mode 100644 index 00000000000..370e9369547 --- /dev/null +++ b/fs/ceph/pagelist.c @@ -0,0 +1,54 @@ + +#include <linux/pagemap.h> +#include <linux/highmem.h> + +#include "pagelist.h" + +int ceph_pagelist_release(struct ceph_pagelist *pl) +{ + if (pl->mapped_tail) + kunmap(pl->mapped_tail); + while (!list_empty(&pl->head)) { + struct page *page = list_first_entry(&pl->head, struct page, + lru); + list_del(&page->lru); + __free_page(page); + } + return 0; +} + +static int ceph_pagelist_addpage(struct ceph_pagelist *pl) +{ + struct page *page = alloc_page(GFP_NOFS); + if (!page) + return -ENOMEM; + pl->room += PAGE_SIZE; + list_add_tail(&page->lru, &pl->head); + if (pl->mapped_tail) + kunmap(pl->mapped_tail); + pl->mapped_tail = kmap(page); + return 0; +} + +int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len) +{ + while (pl->room < len) { + size_t bit = pl->room; + int ret; + + memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), + buf, bit); + pl->length += bit; + pl->room -= bit; + buf += bit; + len -= bit; + ret = ceph_pagelist_addpage(pl); + if (ret) + return ret; + } + + memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len); + pl->length += len; + pl->room -= len; + return 0; +} diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h new file mode 100644 index 00000000000..e8a4187e108 --- /dev/null +++ b/fs/ceph/pagelist.h @@ -0,0 +1,54 @@ +#ifndef __FS_CEPH_PAGELIST_H +#define __FS_CEPH_PAGELIST_H + +#include <linux/list.h> + +struct ceph_pagelist { + struct list_head head; + void *mapped_tail; + size_t length; + size_t room; +}; + +static inline void ceph_pagelist_init(struct ceph_pagelist *pl) +{ + INIT_LIST_HEAD(&pl->head); + pl->mapped_tail = NULL; + pl->length = 0; + pl->room = 0; +} +extern int ceph_pagelist_release(struct ceph_pagelist *pl); + +extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l); + +static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v) +{ + __le64 ev = cpu_to_le64(v); + return ceph_pagelist_append(pl, &ev, sizeof(ev)); +} +static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v) +{ + __le32 ev = cpu_to_le32(v); + return ceph_pagelist_append(pl, &ev, sizeof(ev)); +} +static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v) +{ + __le16 ev = cpu_to_le16(v); + return ceph_pagelist_append(pl, &ev, sizeof(ev)); +} +static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v) +{ + return ceph_pagelist_append(pl, &v, 1); +} +static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl, + char *s, size_t len) +{ + int ret = ceph_pagelist_encode_32(pl, len); + if (ret) + return ret; + if (len) + return ceph_pagelist_append(pl, s, len); + return 0; +} + +#endif diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h new file mode 100644 index 00000000000..26ac8b89a67 --- /dev/null +++ b/fs/ceph/rados.h @@ -0,0 +1,374 @@ +#ifndef __RADOS_H +#define __RADOS_H + +/* + * Data types for the Ceph distributed object storage layer RADOS + * (Reliable Autonomic Distributed Object Store). + */ + +#include "msgr.h" + +/* + * osdmap encoding versions + */ +#define CEPH_OSDMAP_INC_VERSION 4 +#define CEPH_OSDMAP_VERSION 4 + +/* + * fs id + */ +struct ceph_fsid { + unsigned char fsid[16]; +}; + +static inline int ceph_fsid_compare(const struct ceph_fsid *a, + const struct ceph_fsid *b) +{ + return memcmp(a, b, sizeof(*a)); +} + +/* + * ino, object, etc. + */ +typedef __le64 ceph_snapid_t; +#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */ +#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */ +#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */ + +struct ceph_timespec { + __le32 tv_sec; + __le32 tv_nsec; +} __attribute__ ((packed)); + + +/* + * object layout - how objects are mapped into PGs + */ +#define CEPH_OBJECT_LAYOUT_HASH 1 +#define CEPH_OBJECT_LAYOUT_LINEAR 2 +#define CEPH_OBJECT_LAYOUT_HASHINO 3 + +/* + * pg layout -- how PGs are mapped onto (sets of) OSDs + */ +#define CEPH_PG_LAYOUT_CRUSH 0 +#define CEPH_PG_LAYOUT_HASH 1 +#define CEPH_PG_LAYOUT_LINEAR 2 +#define CEPH_PG_LAYOUT_HYBRID 3 + + +/* + * placement group. + * we encode this into one __le64. + */ +struct ceph_pg { + __le16 preferred; /* preferred primary osd */ + __le16 ps; /* placement seed */ + __le32 pool; /* object pool */ +} __attribute__ ((packed)); + +/* + * pg_pool is a set of pgs storing a pool of objects + * + * pg_num -- base number of pseudorandomly placed pgs + * + * pgp_num -- effective number when calculating pg placement. this + * is used for pg_num increases. new pgs result in data being "split" + * into new pgs. for this to proceed smoothly, new pgs are intiially + * colocated with their parents; that is, pgp_num doesn't increase + * until the new pgs have successfully split. only _then_ are the new + * pgs placed independently. + * + * lpg_num -- localized pg count (per device). replicas are randomly + * selected. + * + * lpgp_num -- as above. + */ +#define CEPH_PG_TYPE_REP 1 +#define CEPH_PG_TYPE_RAID4 2 +#define CEPH_PG_POOL_VERSION 2 +struct ceph_pg_pool { + __u8 type; /* CEPH_PG_TYPE_* */ + __u8 size; /* number of osds in each pg */ + __u8 crush_ruleset; /* crush placement rule */ + __u8 object_hash; /* hash mapping object name to ps */ + __le32 pg_num, pgp_num; /* number of pg's */ + __le32 lpg_num, lpgp_num; /* number of localized pg's */ + __le32 last_change; /* most recent epoch changed */ + __le64 snap_seq; /* seq for per-pool snapshot */ + __le32 snap_epoch; /* epoch of last snap */ + __le32 num_snaps; + __le32 num_removed_snap_intervals; + __le64 uid; +} __attribute__ ((packed)); + +/* + * stable_mod func is used to control number of placement groups. + * similar to straight-up modulo, but produces a stable mapping as b + * increases over time. b is the number of bins, and bmask is the + * containing power of 2 minus 1. + * + * b <= bmask and bmask=(2**n)-1 + * e.g., b=12 -> bmask=15, b=123 -> bmask=127 + */ +static inline int ceph_stable_mod(int x, int b, int bmask) +{ + if ((x & bmask) < b) + return x & bmask; + else + return x & (bmask >> 1); +} + +/* + * object layout - how a given object should be stored. + */ +struct ceph_object_layout { + struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */ + __le32 ol_stripe_unit; /* for per-object parity, if any */ +} __attribute__ ((packed)); + +/* + * compound epoch+version, used by storage layer to serialize mutations + */ +struct ceph_eversion { + __le32 epoch; + __le64 version; +} __attribute__ ((packed)); + +/* + * osd map bits + */ + +/* status bits */ +#define CEPH_OSD_EXISTS 1 +#define CEPH_OSD_UP 2 + +/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */ +#define CEPH_OSD_IN 0x10000 +#define CEPH_OSD_OUT 0 + + +/* + * osd map flag bits + */ +#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */ +#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */ +#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */ +#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */ +#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */ + +/* + * osd ops + */ +#define CEPH_OSD_OP_MODE 0xf000 +#define CEPH_OSD_OP_MODE_RD 0x1000 +#define CEPH_OSD_OP_MODE_WR 0x2000 +#define CEPH_OSD_OP_MODE_RMW 0x3000 +#define CEPH_OSD_OP_MODE_SUB 0x4000 + +#define CEPH_OSD_OP_TYPE 0x0f00 +#define CEPH_OSD_OP_TYPE_LOCK 0x0100 +#define CEPH_OSD_OP_TYPE_DATA 0x0200 +#define CEPH_OSD_OP_TYPE_ATTR 0x0300 +#define CEPH_OSD_OP_TYPE_EXEC 0x0400 +#define CEPH_OSD_OP_TYPE_PG 0x0500 + +enum { + /** data **/ + /* read */ + CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1, + CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2, + + /* fancy read */ + CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4, + + /* write */ + CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1, + CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2, + CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3, + CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4, + CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5, + + /* fancy write */ + CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6, + CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7, + CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8, + CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9, + + CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10, + CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11, + CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12, + + CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13, + + /** attrs **/ + /* read */ + CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, + CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, + + /* write */ + CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, + CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2, + CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3, + CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4, + + /** subop **/ + CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1, + CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2, + CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3, + CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4, + CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5, + + /** lock **/ + CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1, + CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2, + CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3, + CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4, + CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5, + CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6, + + /** exec **/ + CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1, + + /** pg **/ + CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1, +}; + +static inline int ceph_osd_op_type_lock(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK; +} +static inline int ceph_osd_op_type_data(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA; +} +static inline int ceph_osd_op_type_attr(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR; +} +static inline int ceph_osd_op_type_exec(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC; +} +static inline int ceph_osd_op_type_pg(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG; +} + +static inline int ceph_osd_op_mode_subop(int op) +{ + return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB; +} +static inline int ceph_osd_op_mode_read(int op) +{ + return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD; +} +static inline int ceph_osd_op_mode_modify(int op) +{ + return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR; +} + +#define CEPH_OSD_TMAP_HDR 'h' +#define CEPH_OSD_TMAP_SET 's' +#define CEPH_OSD_TMAP_RM 'r' + +extern const char *ceph_osd_op_name(int op); + + +/* + * osd op flags + * + * An op may be READ, WRITE, or READ|WRITE. + */ +enum { + CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */ + CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */ + CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */ + CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */ + CEPH_OSD_FLAG_READ = 16, /* op may read */ + CEPH_OSD_FLAG_WRITE = 32, /* op may write */ + CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */ + CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */ + CEPH_OSD_FLAG_BALANCE_READS = 256, + CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */ + CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */ + CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */ +}; + +enum { + CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */ +}; + +#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ +#define EBLACKLISTED ESHUTDOWN /* blacklisted */ + +/* + * an individual object operation. each may be accompanied by some data + * payload + */ +struct ceph_osd_op { + __le16 op; /* CEPH_OSD_OP_* */ + __le32 flags; /* CEPH_OSD_FLAG_* */ + union { + struct { + __le64 offset, length; + __le64 truncate_size; + __le32 truncate_seq; + } __attribute__ ((packed)) extent; + struct { + __le32 name_len; + __le32 value_len; + } __attribute__ ((packed)) xattr; + struct { + __u8 class_len; + __u8 method_len; + __u8 argc; + __le32 indata_len; + } __attribute__ ((packed)) cls; + struct { + __le64 cookie, count; + } __attribute__ ((packed)) pgls; + }; + __le32 payload_len; +} __attribute__ ((packed)); + +/* + * osd request message header. each request may include multiple + * ceph_osd_op object operations. + */ +struct ceph_osd_request_head { + __le32 client_inc; /* client incarnation */ + struct ceph_object_layout layout; /* pgid */ + __le32 osdmap_epoch; /* client's osdmap epoch */ + + __le32 flags; + + struct ceph_timespec mtime; /* for mutations only */ + struct ceph_eversion reassert_version; /* if we are replaying op */ + + __le32 object_len; /* length of object name */ + + __le64 snapid; /* snapid to read */ + __le64 snap_seq; /* writer's snap context */ + __le32 num_snaps; + + __le16 num_ops; + struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */ +} __attribute__ ((packed)); + +struct ceph_osd_reply_head { + __le32 client_inc; /* client incarnation */ + __le32 flags; + struct ceph_object_layout layout; + __le32 osdmap_epoch; + struct ceph_eversion reassert_version; /* for replaying uncommitted */ + + __le32 result; /* result code */ + + __le32 object_len; /* length of object name */ + __le32 num_ops; + struct ceph_osd_op ops[0]; /* ops[], object */ +} __attribute__ ((packed)); + + +#endif diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c new file mode 100644 index 00000000000..bf2a5f3846a --- /dev/null +++ b/fs/ceph/snap.c @@ -0,0 +1,904 @@ +#include "ceph_debug.h" + +#include <linux/sort.h> + +#include "super.h" +#include "decode.h" + +/* + * Snapshots in ceph are driven in large part by cooperation from the + * client. In contrast to local file systems or file servers that + * implement snapshots at a single point in the system, ceph's + * distributed access to storage requires clients to help decide + * whether a write logically occurs before or after a recently created + * snapshot. + * + * This provides a perfect instantanous client-wide snapshot. Between + * clients, however, snapshots may appear to be applied at slightly + * different points in time, depending on delays in delivering the + * snapshot notification. + * + * Snapshots are _not_ file system-wide. Instead, each snapshot + * applies to the subdirectory nested beneath some directory. This + * effectively divides the hierarchy into multiple "realms," where all + * of the files contained by each realm share the same set of + * snapshots. An individual realm's snap set contains snapshots + * explicitly created on that realm, as well as any snaps in its + * parent's snap set _after_ the point at which the parent became it's + * parent (due to, say, a rename). Similarly, snaps from prior parents + * during the time intervals during which they were the parent are included. + * + * The client is spared most of this detail, fortunately... it must only + * maintains a hierarchy of realms reflecting the current parent/child + * realm relationship, and for each realm has an explicit list of snaps + * inherited from prior parents. + * + * A snap_realm struct is maintained for realms containing every inode + * with an open cap in the system. (The needed snap realm information is + * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq' + * version number is used to ensure that as realm parameters change (new + * snapshot, new parent, etc.) the client's realm hierarchy is updated. + * + * The realm hierarchy drives the generation of a 'snap context' for each + * realm, which simply lists the resulting set of snaps for the realm. This + * is attached to any writes sent to OSDs. + */ +/* + * Unfortunately error handling is a bit mixed here. If we get a snap + * update, but don't have enough memory to update our realm hierarchy, + * it's not clear what we can do about it (besides complaining to the + * console). + */ + + +/* + * increase ref count for the realm + * + * caller must hold snap_rwsem for write. + */ +void ceph_get_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm) +{ + dout("get_realm %p %d -> %d\n", realm, + atomic_read(&realm->nref), atomic_read(&realm->nref)+1); + /* + * since we _only_ increment realm refs or empty the empty + * list with snap_rwsem held, adjusting the empty list here is + * safe. we do need to protect against concurrent empty list + * additions, however. + */ + if (atomic_read(&realm->nref) == 0) { + spin_lock(&mdsc->snap_empty_lock); + list_del_init(&realm->empty_item); + spin_unlock(&mdsc->snap_empty_lock); + } + + atomic_inc(&realm->nref); +} + +static void __insert_snap_realm(struct rb_root *root, + struct ceph_snap_realm *new) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct ceph_snap_realm *r = NULL; + + while (*p) { + parent = *p; + r = rb_entry(parent, struct ceph_snap_realm, node); + if (new->ino < r->ino) + p = &(*p)->rb_left; + else if (new->ino > r->ino) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, root); +} + +/* + * create and get the realm rooted at @ino and bump its ref count. + * + * caller must hold snap_rwsem for write. + */ +static struct ceph_snap_realm *ceph_create_snap_realm( + struct ceph_mds_client *mdsc, + u64 ino) +{ + struct ceph_snap_realm *realm; + + realm = kzalloc(sizeof(*realm), GFP_NOFS); + if (!realm) + return ERR_PTR(-ENOMEM); + + atomic_set(&realm->nref, 0); /* tree does not take a ref */ + realm->ino = ino; + INIT_LIST_HEAD(&realm->children); + INIT_LIST_HEAD(&realm->child_item); + INIT_LIST_HEAD(&realm->empty_item); + INIT_LIST_HEAD(&realm->inodes_with_caps); + spin_lock_init(&realm->inodes_with_caps_lock); + __insert_snap_realm(&mdsc->snap_realms, realm); + dout("create_snap_realm %llx %p\n", realm->ino, realm); + return realm; +} + +/* + * lookup the realm rooted at @ino. + * + * caller must hold snap_rwsem for write. + */ +struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, + u64 ino) +{ + struct rb_node *n = mdsc->snap_realms.rb_node; + struct ceph_snap_realm *r; + + while (n) { + r = rb_entry(n, struct ceph_snap_realm, node); + if (ino < r->ino) + n = n->rb_left; + else if (ino > r->ino) + n = n->rb_right; + else { + dout("lookup_snap_realm %llx %p\n", r->ino, r); + return r; + } + } + return NULL; +} + +static void __put_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm); + +/* + * called with snap_rwsem (write) + */ +static void __destroy_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm) +{ + dout("__destroy_snap_realm %p %llx\n", realm, realm->ino); + + rb_erase(&realm->node, &mdsc->snap_realms); + + if (realm->parent) { + list_del_init(&realm->child_item); + __put_snap_realm(mdsc, realm->parent); + } + + kfree(realm->prior_parent_snaps); + kfree(realm->snaps); + ceph_put_snap_context(realm->cached_context); + kfree(realm); +} + +/* + * caller holds snap_rwsem (write) + */ +static void __put_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm) +{ + dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm, + atomic_read(&realm->nref), atomic_read(&realm->nref)-1); + if (atomic_dec_and_test(&realm->nref)) + __destroy_snap_realm(mdsc, realm); +} + +/* + * caller needn't hold any locks + */ +void ceph_put_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm) +{ + dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm, + atomic_read(&realm->nref), atomic_read(&realm->nref)-1); + if (!atomic_dec_and_test(&realm->nref)) + return; + + if (down_write_trylock(&mdsc->snap_rwsem)) { + __destroy_snap_realm(mdsc, realm); + up_write(&mdsc->snap_rwsem); + } else { + spin_lock(&mdsc->snap_empty_lock); + list_add(&mdsc->snap_empty, &realm->empty_item); + spin_unlock(&mdsc->snap_empty_lock); + } +} + +/* + * Clean up any realms whose ref counts have dropped to zero. Note + * that this does not include realms who were created but not yet + * used. + * + * Called under snap_rwsem (write) + */ +static void __cleanup_empty_realms(struct ceph_mds_client *mdsc) +{ + struct ceph_snap_realm *realm; + + spin_lock(&mdsc->snap_empty_lock); + while (!list_empty(&mdsc->snap_empty)) { + realm = list_first_entry(&mdsc->snap_empty, + struct ceph_snap_realm, empty_item); + list_del(&realm->empty_item); + spin_unlock(&mdsc->snap_empty_lock); + __destroy_snap_realm(mdsc, realm); + spin_lock(&mdsc->snap_empty_lock); + } + spin_unlock(&mdsc->snap_empty_lock); +} + +void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc) +{ + down_write(&mdsc->snap_rwsem); + __cleanup_empty_realms(mdsc); + up_write(&mdsc->snap_rwsem); +} + +/* + * adjust the parent realm of a given @realm. adjust child list, and parent + * pointers, and ref counts appropriately. + * + * return true if parent was changed, 0 if unchanged, <0 on error. + * + * caller must hold snap_rwsem for write. + */ +static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm, + u64 parentino) +{ + struct ceph_snap_realm *parent; + + if (realm->parent_ino == parentino) + return 0; + + parent = ceph_lookup_snap_realm(mdsc, parentino); + if (!parent) { + parent = ceph_create_snap_realm(mdsc, parentino); + if (IS_ERR(parent)) + return PTR_ERR(parent); + } + dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n", + realm->ino, realm, realm->parent_ino, realm->parent, + parentino, parent); + if (realm->parent) { + list_del_init(&realm->child_item); + ceph_put_snap_realm(mdsc, realm->parent); + } + realm->parent_ino = parentino; + realm->parent = parent; + ceph_get_snap_realm(mdsc, parent); + list_add(&realm->child_item, &parent->children); + return 1; +} + + +static int cmpu64_rev(const void *a, const void *b) +{ + if (*(u64 *)a < *(u64 *)b) + return 1; + if (*(u64 *)a > *(u64 *)b) + return -1; + return 0; +} + +/* + * build the snap context for a given realm. + */ +static int build_snap_context(struct ceph_snap_realm *realm) +{ + struct ceph_snap_realm *parent = realm->parent; + struct ceph_snap_context *snapc; + int err = 0; + int i; + int num = realm->num_prior_parent_snaps + realm->num_snaps; + + /* + * build parent context, if it hasn't been built. + * conservatively estimate that all parent snaps might be + * included by us. + */ + if (parent) { + if (!parent->cached_context) { + err = build_snap_context(parent); + if (err) + goto fail; + } + num += parent->cached_context->num_snaps; + } + + /* do i actually need to update? not if my context seq + matches realm seq, and my parents' does to. (this works + because we rebuild_snap_realms() works _downward_ in + hierarchy after each update.) */ + if (realm->cached_context && + realm->cached_context->seq <= realm->seq && + (!parent || + realm->cached_context->seq <= parent->cached_context->seq)) { + dout("build_snap_context %llx %p: %p seq %lld (%d snaps)" + " (unchanged)\n", + realm->ino, realm, realm->cached_context, + realm->cached_context->seq, + realm->cached_context->num_snaps); + return 0; + } + + /* alloc new snap context */ + err = -ENOMEM; + if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc)) + goto fail; + snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS); + if (!snapc) + goto fail; + atomic_set(&snapc->nref, 1); + + /* build (reverse sorted) snap vector */ + num = 0; + snapc->seq = realm->seq; + if (parent) { + /* include any of parent's snaps occuring _after_ my + parent became my parent */ + for (i = 0; i < parent->cached_context->num_snaps; i++) + if (parent->cached_context->snaps[i] >= + realm->parent_since) + snapc->snaps[num++] = + parent->cached_context->snaps[i]; + if (parent->cached_context->seq > snapc->seq) + snapc->seq = parent->cached_context->seq; + } + memcpy(snapc->snaps + num, realm->snaps, + sizeof(u64)*realm->num_snaps); + num += realm->num_snaps; + memcpy(snapc->snaps + num, realm->prior_parent_snaps, + sizeof(u64)*realm->num_prior_parent_snaps); + num += realm->num_prior_parent_snaps; + + sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL); + snapc->num_snaps = num; + dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n", + realm->ino, realm, snapc, snapc->seq, snapc->num_snaps); + + if (realm->cached_context) + ceph_put_snap_context(realm->cached_context); + realm->cached_context = snapc; + return 0; + +fail: + /* + * if we fail, clear old (incorrect) cached_context... hopefully + * we'll have better luck building it later + */ + if (realm->cached_context) { + ceph_put_snap_context(realm->cached_context); + realm->cached_context = NULL; + } + pr_err("build_snap_context %llx %p fail %d\n", realm->ino, + realm, err); + return err; +} + +/* + * rebuild snap context for the given realm and all of its children. + */ +static void rebuild_snap_realms(struct ceph_snap_realm *realm) +{ + struct ceph_snap_realm *child; + + dout("rebuild_snap_realms %llx %p\n", realm->ino, realm); + build_snap_context(realm); + + list_for_each_entry(child, &realm->children, child_item) + rebuild_snap_realms(child); +} + + +/* + * helper to allocate and decode an array of snapids. free prior + * instance, if any. + */ +static int dup_array(u64 **dst, __le64 *src, int num) +{ + int i; + + kfree(*dst); + if (num) { + *dst = kcalloc(num, sizeof(u64), GFP_NOFS); + if (!*dst) + return -ENOMEM; + for (i = 0; i < num; i++) + (*dst)[i] = get_unaligned_le64(src + i); + } else { + *dst = NULL; + } + return 0; +} + + +/* + * When a snapshot is applied, the size/mtime inode metadata is queued + * in a ceph_cap_snap (one for each snapshot) until writeback + * completes and the metadata can be flushed back to the MDS. + * + * However, if a (sync) write is currently in-progress when we apply + * the snapshot, we have to wait until the write succeeds or fails + * (and a final size/mtime is known). In this case the + * cap_snap->writing = 1, and is said to be "pending." When the write + * finishes, we __ceph_finish_cap_snap(). + * + * Caller must hold snap_rwsem for read (i.e., the realm topology won't + * change). + */ +void ceph_queue_cap_snap(struct ceph_inode_info *ci, + struct ceph_snap_context *snapc) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_cap_snap *capsnap; + int used; + + capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); + if (!capsnap) { + pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode); + return; + } + + spin_lock(&inode->i_lock); + used = __ceph_caps_used(ci); + if (__ceph_have_pending_cap_snap(ci)) { + /* there is no point in queuing multiple "pending" cap_snaps, + as no new writes are allowed to start when pending, so any + writes in progress now were started before the previous + cap_snap. lucky us. */ + dout("queue_cap_snap %p snapc %p seq %llu used %d" + " already pending\n", inode, snapc, snapc->seq, used); + kfree(capsnap); + } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) { + igrab(inode); + + atomic_set(&capsnap->nref, 1); + capsnap->ci = ci; + INIT_LIST_HEAD(&capsnap->ci_item); + INIT_LIST_HEAD(&capsnap->flushing_item); + + capsnap->follows = snapc->seq - 1; + capsnap->context = ceph_get_snap_context(snapc); + capsnap->issued = __ceph_caps_issued(ci, NULL); + capsnap->dirty = __ceph_caps_dirty(ci); + + capsnap->mode = inode->i_mode; + capsnap->uid = inode->i_uid; + capsnap->gid = inode->i_gid; + + /* fixme? */ + capsnap->xattr_blob = NULL; + capsnap->xattr_len = 0; + + /* dirty page count moved from _head to this cap_snap; + all subsequent writes page dirties occur _after_ this + snapshot. */ + capsnap->dirty_pages = ci->i_wrbuffer_ref_head; + ci->i_wrbuffer_ref_head = 0; + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); + + if (used & CEPH_CAP_FILE_WR) { + dout("queue_cap_snap %p cap_snap %p snapc %p" + " seq %llu used WR, now pending\n", inode, + capsnap, snapc, snapc->seq); + capsnap->writing = 1; + } else { + /* note mtime, size NOW. */ + __ceph_finish_cap_snap(ci, capsnap); + } + } else { + dout("queue_cap_snap %p nothing dirty|writing\n", inode); + kfree(capsnap); + } + + spin_unlock(&inode->i_lock); +} + +/* + * Finalize the size, mtime for a cap_snap.. that is, settle on final values + * to be used for the snapshot, to be flushed back to the mds. + * + * If capsnap can now be flushed, add to snap_flush list, and return 1. + * + * Caller must hold i_lock. + */ +int __ceph_finish_cap_snap(struct ceph_inode_info *ci, + struct ceph_cap_snap *capsnap) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + + BUG_ON(capsnap->writing); + capsnap->size = inode->i_size; + capsnap->mtime = inode->i_mtime; + capsnap->atime = inode->i_atime; + capsnap->ctime = inode->i_ctime; + capsnap->time_warp_seq = ci->i_time_warp_seq; + if (capsnap->dirty_pages) { + dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu " + "still has %d dirty pages\n", inode, capsnap, + capsnap->context, capsnap->context->seq, + capsnap->size, capsnap->dirty_pages); + return 0; + } + dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n", + inode, capsnap, capsnap->context, + capsnap->context->seq, capsnap->size); + + spin_lock(&mdsc->snap_flush_lock); + list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); + spin_unlock(&mdsc->snap_flush_lock); + return 1; /* caller may want to ceph_flush_snaps */ +} + + +/* + * Parse and apply a snapblob "snap trace" from the MDS. This specifies + * the snap realm parameters from a given realm and all of its ancestors, + * up to the root. + * + * Caller must hold snap_rwsem for write. + */ +int ceph_update_snap_trace(struct ceph_mds_client *mdsc, + void *p, void *e, bool deletion) +{ + struct ceph_mds_snap_realm *ri; /* encoded */ + __le64 *snaps; /* encoded */ + __le64 *prior_parent_snaps; /* encoded */ + struct ceph_snap_realm *realm; + int invalidate = 0; + int err = -ENOMEM; + + dout("update_snap_trace deletion=%d\n", deletion); +more: + ceph_decode_need(&p, e, sizeof(*ri), bad); + ri = p; + p += sizeof(*ri); + ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) + + le32_to_cpu(ri->num_prior_parent_snaps)), bad); + snaps = p; + p += sizeof(u64) * le32_to_cpu(ri->num_snaps); + prior_parent_snaps = p; + p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps); + + realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino)); + if (!realm) { + realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino)); + if (IS_ERR(realm)) { + err = PTR_ERR(realm); + goto fail; + } + } + + if (le64_to_cpu(ri->seq) > realm->seq) { + dout("update_snap_trace updating %llx %p %lld -> %lld\n", + realm->ino, realm, realm->seq, le64_to_cpu(ri->seq)); + /* + * if the realm seq has changed, queue a cap_snap for every + * inode with open caps. we do this _before_ we update + * the realm info so that we prepare for writeback under the + * _previous_ snap context. + * + * ...unless it's a snap deletion! + */ + if (!deletion) { + struct ceph_inode_info *ci; + struct inode *lastinode = NULL; + + spin_lock(&realm->inodes_with_caps_lock); + list_for_each_entry(ci, &realm->inodes_with_caps, + i_snap_realm_item) { + struct inode *inode = igrab(&ci->vfs_inode); + if (!inode) + continue; + spin_unlock(&realm->inodes_with_caps_lock); + if (lastinode) + iput(lastinode); + lastinode = inode; + ceph_queue_cap_snap(ci, realm->cached_context); + spin_lock(&realm->inodes_with_caps_lock); + } + spin_unlock(&realm->inodes_with_caps_lock); + if (lastinode) + iput(lastinode); + dout("update_snap_trace cap_snaps queued\n"); + } + + } else { + dout("update_snap_trace %llx %p seq %lld unchanged\n", + realm->ino, realm, realm->seq); + } + + /* ensure the parent is correct */ + err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent)); + if (err < 0) + goto fail; + invalidate += err; + + if (le64_to_cpu(ri->seq) > realm->seq) { + /* update realm parameters, snap lists */ + realm->seq = le64_to_cpu(ri->seq); + realm->created = le64_to_cpu(ri->created); + realm->parent_since = le64_to_cpu(ri->parent_since); + + realm->num_snaps = le32_to_cpu(ri->num_snaps); + err = dup_array(&realm->snaps, snaps, realm->num_snaps); + if (err < 0) + goto fail; + + realm->num_prior_parent_snaps = + le32_to_cpu(ri->num_prior_parent_snaps); + err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps, + realm->num_prior_parent_snaps); + if (err < 0) + goto fail; + + invalidate = 1; + } else if (!realm->cached_context) { + invalidate = 1; + } + + dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, + realm, invalidate, p, e); + + if (p < e) + goto more; + + /* invalidate when we reach the _end_ (root) of the trace */ + if (invalidate) + rebuild_snap_realms(realm); + + __cleanup_empty_realms(mdsc); + return 0; + +bad: + err = -EINVAL; +fail: + pr_err("update_snap_trace error %d\n", err); + return err; +} + + +/* + * Send any cap_snaps that are queued for flush. Try to carry + * s_mutex across multiple snap flushes to avoid locking overhead. + * + * Caller holds no locks. + */ +static void flush_snaps(struct ceph_mds_client *mdsc) +{ + struct ceph_inode_info *ci; + struct inode *inode; + struct ceph_mds_session *session = NULL; + + dout("flush_snaps\n"); + spin_lock(&mdsc->snap_flush_lock); + while (!list_empty(&mdsc->snap_flush_list)) { + ci = list_first_entry(&mdsc->snap_flush_list, + struct ceph_inode_info, i_snap_flush_item); + inode = &ci->vfs_inode; + igrab(inode); + spin_unlock(&mdsc->snap_flush_lock); + spin_lock(&inode->i_lock); + __ceph_flush_snaps(ci, &session); + spin_unlock(&inode->i_lock); + iput(inode); + spin_lock(&mdsc->snap_flush_lock); + } + spin_unlock(&mdsc->snap_flush_lock); + + if (session) { + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + } + dout("flush_snaps done\n"); +} + + +/* + * Handle a snap notification from the MDS. + * + * This can take two basic forms: the simplest is just a snap creation + * or deletion notification on an existing realm. This should update the + * realm and its children. + * + * The more difficult case is realm creation, due to snap creation at a + * new point in the file hierarchy, or due to a rename that moves a file or + * directory into another realm. + */ +void ceph_handle_snap(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct super_block *sb = mdsc->client->sb; + int mds = session->s_mds; + u64 split; + int op; + int trace_len; + struct ceph_snap_realm *realm = NULL; + void *p = msg->front.iov_base; + void *e = p + msg->front.iov_len; + struct ceph_mds_snap_head *h; + int num_split_inos, num_split_realms; + __le64 *split_inos = NULL, *split_realms = NULL; + int i; + int locked_rwsem = 0; + + /* decode */ + if (msg->front.iov_len < sizeof(*h)) + goto bad; + h = p; + op = le32_to_cpu(h->op); + split = le64_to_cpu(h->split); /* non-zero if we are splitting an + * existing realm */ + num_split_inos = le32_to_cpu(h->num_split_inos); + num_split_realms = le32_to_cpu(h->num_split_realms); + trace_len = le32_to_cpu(h->trace_len); + p += sizeof(*h); + + dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds, + ceph_snap_op_name(op), split, trace_len); + + mutex_lock(&session->s_mutex); + session->s_seq++; + mutex_unlock(&session->s_mutex); + + down_write(&mdsc->snap_rwsem); + locked_rwsem = 1; + + if (op == CEPH_SNAP_OP_SPLIT) { + struct ceph_mds_snap_realm *ri; + + /* + * A "split" breaks part of an existing realm off into + * a new realm. The MDS provides a list of inodes + * (with caps) and child realms that belong to the new + * child. + */ + split_inos = p; + p += sizeof(u64) * num_split_inos; + split_realms = p; + p += sizeof(u64) * num_split_realms; + ceph_decode_need(&p, e, sizeof(*ri), bad); + /* we will peek at realm info here, but will _not_ + * advance p, as the realm update will occur below in + * ceph_update_snap_trace. */ + ri = p; + + realm = ceph_lookup_snap_realm(mdsc, split); + if (!realm) { + realm = ceph_create_snap_realm(mdsc, split); + if (IS_ERR(realm)) + goto out; + } + ceph_get_snap_realm(mdsc, realm); + + dout("splitting snap_realm %llx %p\n", realm->ino, realm); + for (i = 0; i < num_split_inos; i++) { + struct ceph_vino vino = { + .ino = le64_to_cpu(split_inos[i]), + .snap = CEPH_NOSNAP, + }; + struct inode *inode = ceph_find_inode(sb, vino); + struct ceph_inode_info *ci; + + if (!inode) + continue; + ci = ceph_inode(inode); + + spin_lock(&inode->i_lock); + if (!ci->i_snap_realm) + goto skip_inode; + /* + * If this inode belongs to a realm that was + * created after our new realm, we experienced + * a race (due to another split notifications + * arriving from a different MDS). So skip + * this inode. + */ + if (ci->i_snap_realm->created > + le64_to_cpu(ri->created)) { + dout(" leaving %p in newer realm %llx %p\n", + inode, ci->i_snap_realm->ino, + ci->i_snap_realm); + goto skip_inode; + } + dout(" will move %p to split realm %llx %p\n", + inode, realm->ino, realm); + /* + * Remove the inode from the realm's inode + * list, but don't add it to the new realm + * yet. We don't want the cap_snap to be + * queued (again) by ceph_update_snap_trace() + * below. Queue it _now_, under the old context. + */ + list_del_init(&ci->i_snap_realm_item); + spin_unlock(&inode->i_lock); + + ceph_queue_cap_snap(ci, + ci->i_snap_realm->cached_context); + + iput(inode); + continue; + +skip_inode: + spin_unlock(&inode->i_lock); + iput(inode); + } + + /* we may have taken some of the old realm's children. */ + for (i = 0; i < num_split_realms; i++) { + struct ceph_snap_realm *child = + ceph_lookup_snap_realm(mdsc, + le64_to_cpu(split_realms[i])); + if (!child) + continue; + adjust_snap_realm_parent(mdsc, child, realm->ino); + } + } + + /* + * update using the provided snap trace. if we are deleting a + * snap, we can avoid queueing cap_snaps. + */ + ceph_update_snap_trace(mdsc, p, e, + op == CEPH_SNAP_OP_DESTROY); + + if (op == CEPH_SNAP_OP_SPLIT) { + /* + * ok, _now_ add the inodes into the new realm. + */ + for (i = 0; i < num_split_inos; i++) { + struct ceph_vino vino = { + .ino = le64_to_cpu(split_inos[i]), + .snap = CEPH_NOSNAP, + }; + struct inode *inode = ceph_find_inode(sb, vino); + struct ceph_inode_info *ci; + + if (!inode) + continue; + ci = ceph_inode(inode); + spin_lock(&inode->i_lock); + if (!ci->i_snap_realm) + goto split_skip_inode; + ceph_put_snap_realm(mdsc, ci->i_snap_realm); + spin_lock(&realm->inodes_with_caps_lock); + list_add(&ci->i_snap_realm_item, + &realm->inodes_with_caps); + ci->i_snap_realm = realm; + spin_unlock(&realm->inodes_with_caps_lock); + ceph_get_snap_realm(mdsc, realm); +split_skip_inode: + spin_unlock(&inode->i_lock); + iput(inode); + } + + /* we took a reference when we created the realm, above */ + ceph_put_snap_realm(mdsc, realm); + } + + __cleanup_empty_realms(mdsc); + + up_write(&mdsc->snap_rwsem); + + flush_snaps(mdsc); + return; + +bad: + pr_err("corrupt snap message from mds%d\n", mds); + ceph_msg_dump(msg); +out: + if (locked_rwsem) + up_write(&mdsc->snap_rwsem); + return; +} + + + diff --git a/fs/ceph/super.c b/fs/ceph/super.c new file mode 100644 index 00000000000..4290a6e860b --- /dev/null +++ b/fs/ceph/super.c @@ -0,0 +1,1030 @@ + +#include "ceph_debug.h" + +#include <linux/backing-dev.h> +#include <linux/fs.h> +#include <linux/inet.h> +#include <linux/in6.h> +#include <linux/module.h> +#include <linux/mount.h> +#include <linux/parser.h> +#include <linux/rwsem.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/statfs.h> +#include <linux/string.h> +#include <linux/version.h> +#include <linux/vmalloc.h> + +#include "decode.h" +#include "super.h" +#include "mon_client.h" +#include "auth.h" + +/* + * Ceph superblock operations + * + * Handle the basics of mounting, unmounting. + */ + + +/* + * find filename portion of a path (/foo/bar/baz -> baz) + */ +const char *ceph_file_part(const char *s, int len) +{ + const char *e = s + len; + + while (e != s && *(e-1) != '/') + e--; + return e; +} + + +/* + * super ops + */ +static void ceph_put_super(struct super_block *s) +{ + struct ceph_client *cl = ceph_client(s); + + dout("put_super\n"); + ceph_mdsc_close_sessions(&cl->mdsc); + return; +} + +static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct ceph_client *client = ceph_inode_to_client(dentry->d_inode); + struct ceph_monmap *monmap = client->monc.monmap; + struct ceph_statfs st; + u64 fsid; + int err; + + dout("statfs\n"); + err = ceph_monc_do_statfs(&client->monc, &st); + if (err < 0) + return err; + + /* fill in kstatfs */ + buf->f_type = CEPH_SUPER_MAGIC; /* ?? */ + + /* + * express utilization in terms of large blocks to avoid + * overflow on 32-bit machines. + */ + buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; + buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); + buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >> + (CEPH_BLOCK_SHIFT-10); + buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); + + buf->f_files = le64_to_cpu(st.num_objects); + buf->f_ffree = -1; + buf->f_namelen = PATH_MAX; + buf->f_frsize = PAGE_CACHE_SIZE; + + /* leave fsid little-endian, regardless of host endianness */ + fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1); + buf->f_fsid.val[0] = fsid & 0xffffffff; + buf->f_fsid.val[1] = fsid >> 32; + + return 0; +} + + +static int ceph_syncfs(struct super_block *sb, int wait) +{ + dout("sync_fs %d\n", wait); + ceph_osdc_sync(&ceph_client(sb)->osdc); + ceph_mdsc_sync(&ceph_client(sb)->mdsc); + dout("sync_fs %d done\n", wait); + return 0; +} + + +/** + * ceph_show_options - Show mount options in /proc/mounts + * @m: seq_file to write to + * @mnt: mount descriptor + */ +static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) +{ + struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb); + struct ceph_mount_args *args = client->mount_args; + + if (args->flags & CEPH_OPT_FSID) + seq_printf(m, ",fsidmajor=%llu,fsidminor%llu", + le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]), + le64_to_cpu(*(__le64 *)&args->fsid.fsid[8])); + if (args->flags & CEPH_OPT_NOSHARE) + seq_puts(m, ",noshare"); + if (args->flags & CEPH_OPT_DIRSTAT) + seq_puts(m, ",dirstat"); + if ((args->flags & CEPH_OPT_RBYTES) == 0) + seq_puts(m, ",norbytes"); + if (args->flags & CEPH_OPT_NOCRC) + seq_puts(m, ",nocrc"); + if (args->flags & CEPH_OPT_NOASYNCREADDIR) + seq_puts(m, ",noasyncreaddir"); + if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) + seq_printf(m, ",snapdirname=%s", args->snapdir_name); + if (args->name) + seq_printf(m, ",name=%s", args->name); + if (args->secret) + seq_puts(m, ",secret=<hidden>"); + return 0; +} + +/* + * caches + */ +struct kmem_cache *ceph_inode_cachep; +struct kmem_cache *ceph_cap_cachep; +struct kmem_cache *ceph_dentry_cachep; +struct kmem_cache *ceph_file_cachep; + +static void ceph_inode_init_once(void *foo) +{ + struct ceph_inode_info *ci = foo; + inode_init_once(&ci->vfs_inode); +} + +static int default_congestion_kb(void) +{ + int congestion_kb; + + /* + * Copied from NFS + * + * congestion size, scale with available memory. + * + * 64MB: 8192k + * 128MB: 11585k + * 256MB: 16384k + * 512MB: 23170k + * 1GB: 32768k + * 2GB: 46340k + * 4GB: 65536k + * 8GB: 92681k + * 16GB: 131072k + * + * This allows larger machines to have larger/more transfers. + * Limit the default to 256M + */ + congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + if (congestion_kb > 256*1024) + congestion_kb = 256*1024; + + return congestion_kb; +} + +static int __init init_caches(void) +{ + ceph_inode_cachep = kmem_cache_create("ceph_inode_info", + sizeof(struct ceph_inode_info), + __alignof__(struct ceph_inode_info), + (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), + ceph_inode_init_once); + if (ceph_inode_cachep == NULL) + return -ENOMEM; + + ceph_cap_cachep = KMEM_CACHE(ceph_cap, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_cap_cachep == NULL) + goto bad_cap; + + ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_dentry_cachep == NULL) + goto bad_dentry; + + ceph_file_cachep = KMEM_CACHE(ceph_file_info, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_file_cachep == NULL) + goto bad_file; + + return 0; + +bad_file: + kmem_cache_destroy(ceph_dentry_cachep); +bad_dentry: + kmem_cache_destroy(ceph_cap_cachep); +bad_cap: + kmem_cache_destroy(ceph_inode_cachep); + return -ENOMEM; +} + +static void destroy_caches(void) +{ + kmem_cache_destroy(ceph_inode_cachep); + kmem_cache_destroy(ceph_cap_cachep); + kmem_cache_destroy(ceph_dentry_cachep); + kmem_cache_destroy(ceph_file_cachep); +} + + +/* + * ceph_umount_begin - initiate forced umount. Tear down down the + * mount, skipping steps that may hang while waiting for server(s). + */ +static void ceph_umount_begin(struct super_block *sb) +{ + struct ceph_client *client = ceph_sb_to_client(sb); + + dout("ceph_umount_begin - starting forced umount\n"); + if (!client) + return; + client->mount_state = CEPH_MOUNT_SHUTDOWN; + return; +} + +static const struct super_operations ceph_super_ops = { + .alloc_inode = ceph_alloc_inode, + .destroy_inode = ceph_destroy_inode, + .write_inode = ceph_write_inode, + .sync_fs = ceph_syncfs, + .put_super = ceph_put_super, + .show_options = ceph_show_options, + .statfs = ceph_statfs, + .umount_begin = ceph_umount_begin, +}; + + +const char *ceph_msg_type_name(int type) +{ + switch (type) { + case CEPH_MSG_SHUTDOWN: return "shutdown"; + case CEPH_MSG_PING: return "ping"; + case CEPH_MSG_AUTH: return "auth"; + case CEPH_MSG_AUTH_REPLY: return "auth_reply"; + case CEPH_MSG_MON_MAP: return "mon_map"; + case CEPH_MSG_MON_GET_MAP: return "mon_get_map"; + case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe"; + case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack"; + case CEPH_MSG_STATFS: return "statfs"; + case CEPH_MSG_STATFS_REPLY: return "statfs_reply"; + case CEPH_MSG_MDS_MAP: return "mds_map"; + case CEPH_MSG_CLIENT_SESSION: return "client_session"; + case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect"; + case CEPH_MSG_CLIENT_REQUEST: return "client_request"; + case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward"; + case CEPH_MSG_CLIENT_REPLY: return "client_reply"; + case CEPH_MSG_CLIENT_CAPS: return "client_caps"; + case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release"; + case CEPH_MSG_CLIENT_SNAP: return "client_snap"; + case CEPH_MSG_CLIENT_LEASE: return "client_lease"; + case CEPH_MSG_OSD_MAP: return "osd_map"; + case CEPH_MSG_OSD_OP: return "osd_op"; + case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; + default: return "unknown"; + } +} + + +/* + * mount options + */ +enum { + Opt_fsidmajor, + Opt_fsidminor, + Opt_monport, + Opt_wsize, + Opt_rsize, + Opt_osdtimeout, + Opt_osdkeepalivetimeout, + Opt_mount_timeout, + Opt_osd_idle_ttl, + Opt_caps_wanted_delay_min, + Opt_caps_wanted_delay_max, + Opt_readdir_max_entries, + Opt_congestion_kb, + Opt_last_int, + /* int args above */ + Opt_snapdirname, + Opt_name, + Opt_secret, + Opt_last_string, + /* string args above */ + Opt_ip, + Opt_noshare, + Opt_dirstat, + Opt_nodirstat, + Opt_rbytes, + Opt_norbytes, + Opt_nocrc, + Opt_noasyncreaddir, +}; + +static match_table_t arg_tokens = { + {Opt_fsidmajor, "fsidmajor=%ld"}, + {Opt_fsidminor, "fsidminor=%ld"}, + {Opt_monport, "monport=%d"}, + {Opt_wsize, "wsize=%d"}, + {Opt_rsize, "rsize=%d"}, + {Opt_osdtimeout, "osdtimeout=%d"}, + {Opt_osdkeepalivetimeout, "osdkeepalive=%d"}, + {Opt_mount_timeout, "mount_timeout=%d"}, + {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, + {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, + {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, + {Opt_readdir_max_entries, "readdir_max_entries=%d"}, + {Opt_congestion_kb, "write_congestion_kb=%d"}, + /* int args above */ + {Opt_snapdirname, "snapdirname=%s"}, + {Opt_name, "name=%s"}, + {Opt_secret, "secret=%s"}, + /* string args above */ + {Opt_ip, "ip=%s"}, + {Opt_noshare, "noshare"}, + {Opt_dirstat, "dirstat"}, + {Opt_nodirstat, "nodirstat"}, + {Opt_rbytes, "rbytes"}, + {Opt_norbytes, "norbytes"}, + {Opt_nocrc, "nocrc"}, + {Opt_noasyncreaddir, "noasyncreaddir"}, + {-1, NULL} +}; + + +static struct ceph_mount_args *parse_mount_args(int flags, char *options, + const char *dev_name, + const char **path) +{ + struct ceph_mount_args *args; + const char *c; + int err = -ENOMEM; + substring_t argstr[MAX_OPT_ARGS]; + + args = kzalloc(sizeof(*args), GFP_KERNEL); + if (!args) + return ERR_PTR(-ENOMEM); + args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr), + GFP_KERNEL); + if (!args->mon_addr) + goto out; + + dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name); + + /* start with defaults */ + args->sb_flags = flags; + args->flags = CEPH_OPT_DEFAULT; + args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT; + args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; + args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ + args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ + args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; + args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; + args->rsize = CEPH_MOUNT_RSIZE_DEFAULT; + args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); + args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4; + args->max_readdir = 1024; + args->congestion_kb = default_congestion_kb(); + + /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ + err = -EINVAL; + if (!dev_name) + goto out; + *path = strstr(dev_name, ":/"); + if (*path == NULL) { + pr_err("device name is missing path (no :/ in %s)\n", + dev_name); + goto out; + } + + /* get mon ip(s) */ + err = ceph_parse_ips(dev_name, *path, args->mon_addr, + CEPH_MAX_MON, &args->num_mon); + if (err < 0) + goto out; + + /* path on server */ + *path += 2; + dout("server path '%s'\n", *path); + + /* parse mount options */ + while ((c = strsep(&options, ",")) != NULL) { + int token, intval, ret; + if (!*c) + continue; + err = -EINVAL; + token = match_token((char *)c, arg_tokens, argstr); + if (token < 0) { + pr_err("bad mount option at '%s'\n", c); + goto out; + } + if (token < Opt_last_int) { + ret = match_int(&argstr[0], &intval); + if (ret < 0) { + pr_err("bad mount option arg (not int) " + "at '%s'\n", c); + continue; + } + dout("got int token %d val %d\n", token, intval); + } else if (token > Opt_last_int && token < Opt_last_string) { + dout("got string token %d val %s\n", token, + argstr[0].from); + } else { + dout("got token %d\n", token); + } + switch (token) { + case Opt_fsidmajor: + *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval); + break; + case Opt_fsidminor: + *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval); + break; + case Opt_ip: + err = ceph_parse_ips(argstr[0].from, + argstr[0].to, + &args->my_addr, + 1, NULL); + if (err < 0) + goto out; + args->flags |= CEPH_OPT_MYIP; + break; + + case Opt_snapdirname: + kfree(args->snapdir_name); + args->snapdir_name = kstrndup(argstr[0].from, + argstr[0].to-argstr[0].from, + GFP_KERNEL); + break; + case Opt_name: + args->name = kstrndup(argstr[0].from, + argstr[0].to-argstr[0].from, + GFP_KERNEL); + break; + case Opt_secret: + args->secret = kstrndup(argstr[0].from, + argstr[0].to-argstr[0].from, + GFP_KERNEL); + break; + + /* misc */ + case Opt_wsize: + args->wsize = intval; + break; + case Opt_rsize: + args->rsize = intval; + break; + case Opt_osdtimeout: + args->osd_timeout = intval; + break; + case Opt_osdkeepalivetimeout: + args->osd_keepalive_timeout = intval; + break; + case Opt_mount_timeout: + args->mount_timeout = intval; + break; + case Opt_caps_wanted_delay_min: + args->caps_wanted_delay_min = intval; + break; + case Opt_caps_wanted_delay_max: + args->caps_wanted_delay_max = intval; + break; + case Opt_readdir_max_entries: + args->max_readdir = intval; + break; + case Opt_congestion_kb: + args->congestion_kb = intval; + break; + + case Opt_noshare: + args->flags |= CEPH_OPT_NOSHARE; + break; + + case Opt_dirstat: + args->flags |= CEPH_OPT_DIRSTAT; + break; + case Opt_nodirstat: + args->flags &= ~CEPH_OPT_DIRSTAT; + break; + case Opt_rbytes: + args->flags |= CEPH_OPT_RBYTES; + break; + case Opt_norbytes: + args->flags &= ~CEPH_OPT_RBYTES; + break; + case Opt_nocrc: + args->flags |= CEPH_OPT_NOCRC; + break; + case Opt_noasyncreaddir: + args->flags |= CEPH_OPT_NOASYNCREADDIR; + break; + + default: + BUG_ON(token); + } + } + return args; + +out: + kfree(args->mon_addr); + kfree(args); + return ERR_PTR(err); +} + +static void destroy_mount_args(struct ceph_mount_args *args) +{ + dout("destroy_mount_args %p\n", args); + kfree(args->snapdir_name); + args->snapdir_name = NULL; + kfree(args->name); + args->name = NULL; + kfree(args->secret); + args->secret = NULL; + kfree(args); +} + +/* + * create a fresh client instance + */ +static struct ceph_client *ceph_create_client(struct ceph_mount_args *args) +{ + struct ceph_client *client; + int err = -ENOMEM; + + client = kzalloc(sizeof(*client), GFP_KERNEL); + if (client == NULL) + return ERR_PTR(-ENOMEM); + + mutex_init(&client->mount_mutex); + + init_waitqueue_head(&client->auth_wq); + + client->sb = NULL; + client->mount_state = CEPH_MOUNT_MOUNTING; + client->mount_args = args; + + client->msgr = NULL; + + client->auth_err = 0; + atomic_long_set(&client->writeback_count, 0); + + err = bdi_init(&client->backing_dev_info); + if (err < 0) + goto fail; + + err = -ENOMEM; + client->wb_wq = create_workqueue("ceph-writeback"); + if (client->wb_wq == NULL) + goto fail_bdi; + client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); + if (client->pg_inv_wq == NULL) + goto fail_wb_wq; + client->trunc_wq = create_singlethread_workqueue("ceph-trunc"); + if (client->trunc_wq == NULL) + goto fail_pg_inv_wq; + + /* set up mempools */ + err = -ENOMEM; + client->wb_pagevec_pool = mempool_create_kmalloc_pool(10, + client->mount_args->wsize >> PAGE_CACHE_SHIFT); + if (!client->wb_pagevec_pool) + goto fail_trunc_wq; + + /* caps */ + client->min_caps = args->max_readdir; + ceph_adjust_min_caps(client->min_caps); + + /* subsystems */ + err = ceph_monc_init(&client->monc, client); + if (err < 0) + goto fail_mempool; + err = ceph_osdc_init(&client->osdc, client); + if (err < 0) + goto fail_monc; + err = ceph_mdsc_init(&client->mdsc, client); + if (err < 0) + goto fail_osdc; + return client; + +fail_osdc: + ceph_osdc_stop(&client->osdc); +fail_monc: + ceph_monc_stop(&client->monc); +fail_mempool: + mempool_destroy(client->wb_pagevec_pool); +fail_trunc_wq: + destroy_workqueue(client->trunc_wq); +fail_pg_inv_wq: + destroy_workqueue(client->pg_inv_wq); +fail_wb_wq: + destroy_workqueue(client->wb_wq); +fail_bdi: + bdi_destroy(&client->backing_dev_info); +fail: + kfree(client); + return ERR_PTR(err); +} + +static void ceph_destroy_client(struct ceph_client *client) +{ + dout("destroy_client %p\n", client); + + /* unmount */ + ceph_mdsc_stop(&client->mdsc); + ceph_monc_stop(&client->monc); + ceph_osdc_stop(&client->osdc); + + ceph_adjust_min_caps(-client->min_caps); + + ceph_debugfs_client_cleanup(client); + destroy_workqueue(client->wb_wq); + destroy_workqueue(client->pg_inv_wq); + destroy_workqueue(client->trunc_wq); + + if (client->msgr) + ceph_messenger_destroy(client->msgr); + mempool_destroy(client->wb_pagevec_pool); + + destroy_mount_args(client->mount_args); + + kfree(client); + dout("destroy_client %p done\n", client); +} + +/* + * Initially learn our fsid, or verify an fsid matches. + */ +int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) +{ + if (client->have_fsid) { + if (ceph_fsid_compare(&client->fsid, fsid)) { + pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT, + PR_FSID(&client->fsid), PR_FSID(fsid)); + return -1; + } + } else { + pr_info("client%lld fsid " FSID_FORMAT "\n", + client->monc.auth->global_id, PR_FSID(fsid)); + memcpy(&client->fsid, fsid, sizeof(*fsid)); + ceph_debugfs_client_init(client); + client->have_fsid = true; + } + return 0; +} + +/* + * true if we have the mon map (and have thus joined the cluster) + */ +static int have_mon_map(struct ceph_client *client) +{ + return client->monc.monmap && client->monc.monmap->epoch; +} + +/* + * Bootstrap mount by opening the root directory. Note the mount + * @started time from caller, and time out if this takes too long. + */ +static struct dentry *open_root_dentry(struct ceph_client *client, + const char *path, + unsigned long started) +{ + struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_mds_request *req = NULL; + int err; + struct dentry *root; + + /* open dir */ + dout("open_root_inode opening '%s'\n", path); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_PTR(PTR_ERR(req)); + req->r_path1 = kstrdup(path, GFP_NOFS); + req->r_ino1.ino = CEPH_INO_ROOT; + req->r_ino1.snap = CEPH_NOSNAP; + req->r_started = started; + req->r_timeout = client->mount_args->mount_timeout * HZ; + req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); + req->r_num_caps = 2; + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err == 0) { + dout("open_root_inode success\n"); + if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && + client->sb->s_root == NULL) + root = d_alloc_root(req->r_target_inode); + else + root = d_obtain_alias(req->r_target_inode); + req->r_target_inode = NULL; + dout("open_root_inode success, root dentry is %p\n", root); + } else { + root = ERR_PTR(err); + } + ceph_mdsc_put_request(req); + return root; +} + +/* + * mount: join the ceph cluster, and open root directory. + */ +static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, + const char *path) +{ + struct ceph_entity_addr *myaddr = NULL; + int err; + unsigned long timeout = client->mount_args->mount_timeout * HZ; + unsigned long started = jiffies; /* note the start time */ + struct dentry *root; + + dout("mount start\n"); + mutex_lock(&client->mount_mutex); + + /* initialize the messenger */ + if (client->msgr == NULL) { + if (ceph_test_opt(client, MYIP)) + myaddr = &client->mount_args->my_addr; + client->msgr = ceph_messenger_create(myaddr); + if (IS_ERR(client->msgr)) { + err = PTR_ERR(client->msgr); + client->msgr = NULL; + goto out; + } + client->msgr->nocrc = ceph_test_opt(client, NOCRC); + } + + /* open session, and wait for mon, mds, and osd maps */ + err = ceph_monc_open_session(&client->monc); + if (err < 0) + goto out; + + while (!have_mon_map(client)) { + err = -EIO; + if (timeout && time_after_eq(jiffies, started + timeout)) + goto out; + + /* wait */ + dout("mount waiting for mon_map\n"); + err = wait_event_interruptible_timeout(client->auth_wq, + have_mon_map(client) || (client->auth_err < 0), + timeout); + if (err == -EINTR || err == -ERESTARTSYS) + goto out; + if (client->auth_err < 0) { + err = client->auth_err; + goto out; + } + } + + dout("mount opening root\n"); + root = open_root_dentry(client, "", started); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out; + } + if (client->sb->s_root) + dput(root); + else + client->sb->s_root = root; + + if (path[0] == 0) { + dget(root); + } else { + dout("mount opening base mountpoint\n"); + root = open_root_dentry(client, path, started); + if (IS_ERR(root)) { + err = PTR_ERR(root); + dput(client->sb->s_root); + client->sb->s_root = NULL; + goto out; + } + } + + mnt->mnt_root = root; + mnt->mnt_sb = client->sb; + + client->mount_state = CEPH_MOUNT_MOUNTED; + dout("mount success\n"); + err = 0; + +out: + mutex_unlock(&client->mount_mutex); + return err; +} + +static int ceph_set_super(struct super_block *s, void *data) +{ + struct ceph_client *client = data; + int ret; + + dout("set_super %p data %p\n", s, data); + + s->s_flags = client->mount_args->sb_flags; + s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ + + s->s_fs_info = client; + client->sb = s; + + s->s_op = &ceph_super_ops; + s->s_export_op = &ceph_export_ops; + + s->s_time_gran = 1000; /* 1000 ns == 1 us */ + + ret = set_anon_super(s, NULL); /* what is that second arg for? */ + if (ret != 0) + goto fail; + + return ret; + +fail: + s->s_fs_info = NULL; + client->sb = NULL; + return ret; +} + +/* + * share superblock if same fs AND options + */ +static int ceph_compare_super(struct super_block *sb, void *data) +{ + struct ceph_client *new = data; + struct ceph_mount_args *args = new->mount_args; + struct ceph_client *other = ceph_sb_to_client(sb); + int i; + + dout("ceph_compare_super %p\n", sb); + if (args->flags & CEPH_OPT_FSID) { + if (ceph_fsid_compare(&args->fsid, &other->fsid)) { + dout("fsid doesn't match\n"); + return 0; + } + } else { + /* do we share (a) monitor? */ + for (i = 0; i < new->monc.monmap->num_mon; i++) + if (ceph_monmap_contains(other->monc.monmap, + &new->monc.monmap->mon_inst[i].addr)) + break; + if (i == new->monc.monmap->num_mon) { + dout("mon ip not part of monmap\n"); + return 0; + } + dout("mon ip matches existing sb %p\n", sb); + } + if (args->sb_flags != other->mount_args->sb_flags) { + dout("flags differ\n"); + return 0; + } + return 1; +} + +/* + * construct our own bdi so we can control readahead, etc. + */ +static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) +{ + int err; + + sb->s_bdi = &client->backing_dev_info; + + /* set ra_pages based on rsize mount option? */ + if (client->mount_args->rsize >= PAGE_CACHE_SIZE) + client->backing_dev_info.ra_pages = + (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) + >> PAGE_SHIFT; + err = bdi_register_dev(&client->backing_dev_info, sb->s_dev); + return err; +} + +static int ceph_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + struct vfsmount *mnt) +{ + struct super_block *sb; + struct ceph_client *client; + int err; + int (*compare_super)(struct super_block *, void *) = ceph_compare_super; + const char *path = NULL; + struct ceph_mount_args *args; + + dout("ceph_get_sb\n"); + args = parse_mount_args(flags, data, dev_name, &path); + if (IS_ERR(args)) { + err = PTR_ERR(args); + goto out_final; + } + + /* create client (which we may/may not use) */ + client = ceph_create_client(args); + if (IS_ERR(client)) { + err = PTR_ERR(client); + goto out_final; + } + + if (client->mount_args->flags & CEPH_OPT_NOSHARE) + compare_super = NULL; + sb = sget(fs_type, compare_super, ceph_set_super, client); + if (IS_ERR(sb)) { + err = PTR_ERR(sb); + goto out; + } + + if (ceph_client(sb) != client) { + ceph_destroy_client(client); + client = ceph_client(sb); + dout("get_sb got existing client %p\n", client); + } else { + dout("get_sb using new client %p\n", client); + err = ceph_register_bdi(sb, client); + if (err < 0) + goto out_splat; + } + + err = ceph_mount(client, mnt, path); + if (err < 0) + goto out_splat; + dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root, + mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode)); + return 0; + +out_splat: + ceph_mdsc_close_sessions(&client->mdsc); + up_write(&sb->s_umount); + deactivate_super(sb); + goto out_final; + +out: + ceph_destroy_client(client); +out_final: + dout("ceph_get_sb fail %d\n", err); + return err; +} + +static void ceph_kill_sb(struct super_block *s) +{ + struct ceph_client *client = ceph_sb_to_client(s); + dout("kill_sb %p\n", s); + ceph_mdsc_pre_umount(&client->mdsc); + kill_anon_super(s); /* will call put_super after sb is r/o */ + if (s->s_bdi == &client->backing_dev_info) + bdi_unregister(&client->backing_dev_info); + bdi_destroy(&client->backing_dev_info); + ceph_destroy_client(client); +} + +static struct file_system_type ceph_fs_type = { + .owner = THIS_MODULE, + .name = "ceph", + .get_sb = ceph_get_sb, + .kill_sb = ceph_kill_sb, + .fs_flags = FS_RENAME_DOES_D_MOVE, +}; + +#define _STRINGIFY(x) #x +#define STRINGIFY(x) _STRINGIFY(x) + +static int __init init_ceph(void) +{ + int ret = 0; + + ret = ceph_debugfs_init(); + if (ret < 0) + goto out; + + ret = ceph_msgr_init(); + if (ret < 0) + goto out_debugfs; + + ret = init_caches(); + if (ret) + goto out_msgr; + + ceph_caps_init(); + + ret = register_filesystem(&ceph_fs_type); + if (ret) + goto out_icache; + + pr_info("loaded %d.%d.%d (mon/mds/osd proto %d/%d/%d)\n", + CEPH_VERSION_MAJOR, CEPH_VERSION_MINOR, CEPH_VERSION_PATCH, + CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL); + return 0; + +out_icache: + destroy_caches(); +out_msgr: + ceph_msgr_exit(); +out_debugfs: + ceph_debugfs_cleanup(); +out: + return ret; +} + +static void __exit exit_ceph(void) +{ + dout("exit_ceph\n"); + unregister_filesystem(&ceph_fs_type); + ceph_caps_finalize(); + destroy_caches(); + ceph_msgr_exit(); + ceph_debugfs_cleanup(); +} + +module_init(init_ceph); +module_exit(exit_ceph); + +MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); +MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); +MODULE_AUTHOR("Patience Warnick <patience@newdream.net>"); +MODULE_DESCRIPTION("Ceph filesystem for Linux"); +MODULE_LICENSE("GPL"); diff --git a/fs/ceph/super.h b/fs/ceph/super.h new file mode 100644 index 00000000000..65d12036b67 --- /dev/null +++ b/fs/ceph/super.h @@ -0,0 +1,901 @@ +#ifndef _FS_CEPH_SUPER_H +#define _FS_CEPH_SUPER_H + +#include "ceph_debug.h" + +#include <asm/unaligned.h> +#include <linux/backing-dev.h> +#include <linux/completion.h> +#include <linux/exportfs.h> +#include <linux/fs.h> +#include <linux/mempool.h> +#include <linux/pagemap.h> +#include <linux/wait.h> +#include <linux/writeback.h> + +#include "types.h" +#include "messenger.h" +#include "msgpool.h" +#include "mon_client.h" +#include "mds_client.h" +#include "osd_client.h" +#include "ceph_fs.h" + +/* f_type in struct statfs */ +#define CEPH_SUPER_MAGIC 0x00c36400 + +/* large granularity for statfs utilization stats to facilitate + * large volume sizes on 32-bit machines. */ +#define CEPH_BLOCK_SHIFT 20 /* 1 MB */ +#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) + +/* + * mount options + */ +#define CEPH_OPT_FSID (1<<0) +#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */ +#define CEPH_OPT_MYIP (1<<2) /* specified my ip */ +#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */ +#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ +#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */ +#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ + +#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES) + +#define ceph_set_opt(client, opt) \ + (client)->mount_args->flags |= CEPH_OPT_##opt; +#define ceph_test_opt(client, opt) \ + (!!((client)->mount_args->flags & CEPH_OPT_##opt)) + + +struct ceph_mount_args { + int sb_flags; + int num_mon; + struct ceph_entity_addr *mon_addr; + int flags; + int mount_timeout; + int osd_idle_ttl; + int caps_wanted_delay_min, caps_wanted_delay_max; + struct ceph_fsid fsid; + struct ceph_entity_addr my_addr; + int wsize; + int rsize; /* max readahead */ + int max_readdir; /* max readdir size */ + int congestion_kb; /* max readdir size */ + int osd_timeout; + int osd_keepalive_timeout; + char *snapdir_name; /* default ".snap" */ + char *name; + char *secret; + int cap_release_safety; +}; + +/* + * defaults + */ +#define CEPH_MOUNT_TIMEOUT_DEFAULT 60 +#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */ +#define CEPH_OSD_KEEPALIVE_DEFAULT 5 +#define CEPH_OSD_IDLE_TTL_DEFAULT 60 +#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ + +#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) +#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) + +#define CEPH_SNAPDIRNAME_DEFAULT ".snap" +#define CEPH_AUTH_NAME_DEFAULT "guest" + +/* + * Delay telling the MDS we no longer want caps, in case we reopen + * the file. Delay a minimum amount of time, even if we send a cap + * message for some other reason. Otherwise, take the oppotunity to + * update the mds to avoid sending another message later. + */ +#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ +#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ + + +/* mount state */ +enum { + CEPH_MOUNT_MOUNTING, + CEPH_MOUNT_MOUNTED, + CEPH_MOUNT_UNMOUNTING, + CEPH_MOUNT_UNMOUNTED, + CEPH_MOUNT_SHUTDOWN, +}; + +/* + * subtract jiffies + */ +static inline unsigned long time_sub(unsigned long a, unsigned long b) +{ + BUG_ON(time_after(b, a)); + return (long)a - (long)b; +} + +/* + * per-filesystem client state + * + * possibly shared by multiple mount points, if they are + * mounting the same ceph filesystem/cluster. + */ +struct ceph_client { + struct ceph_fsid fsid; + bool have_fsid; + + struct mutex mount_mutex; /* serialize mount attempts */ + struct ceph_mount_args *mount_args; + + struct super_block *sb; + + unsigned long mount_state; + wait_queue_head_t auth_wq; + + int auth_err; + + int min_caps; /* min caps i added */ + + struct ceph_messenger *msgr; /* messenger instance */ + struct ceph_mon_client monc; + struct ceph_mds_client mdsc; + struct ceph_osd_client osdc; + + /* writeback */ + mempool_t *wb_pagevec_pool; + struct workqueue_struct *wb_wq; + struct workqueue_struct *pg_inv_wq; + struct workqueue_struct *trunc_wq; + atomic_long_t writeback_count; + + struct backing_dev_info backing_dev_info; + +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_monmap; + struct dentry *debugfs_mdsmap, *debugfs_osdmap; + struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps; + struct dentry *debugfs_congestion_kb; + struct dentry *debugfs_bdi; +#endif +}; + +static inline struct ceph_client *ceph_client(struct super_block *sb) +{ + return sb->s_fs_info; +} + + +/* + * File i/o capability. This tracks shared state with the metadata + * server that allows us to cache or writeback attributes or to read + * and write data. For any given inode, we should have one or more + * capabilities, one issued by each metadata server, and our + * cumulative access is the OR of all issued capabilities. + * + * Each cap is referenced by the inode's i_caps rbtree and by per-mds + * session capability lists. + */ +struct ceph_cap { + struct ceph_inode_info *ci; + struct rb_node ci_node; /* per-ci cap tree */ + struct ceph_mds_session *session; + struct list_head session_caps; /* per-session caplist */ + int mds; + u64 cap_id; /* unique cap id (mds provided) */ + int issued; /* latest, from the mds */ + int implemented; /* implemented superset of issued (for revocation) */ + int mds_wanted; + u32 seq, issue_seq, mseq; + u32 cap_gen; /* active/stale cycle */ + unsigned long last_used; + struct list_head caps_item; +}; + +#define CHECK_CAPS_NODELAY 1 /* do not delay any further */ +#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */ +#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */ + +/* + * Snapped cap state that is pending flush to mds. When a snapshot occurs, + * we first complete any in-process sync writes and writeback any dirty + * data before flushing the snapped state (tracked here) back to the MDS. + */ +struct ceph_cap_snap { + atomic_t nref; + struct ceph_inode_info *ci; + struct list_head ci_item, flushing_item; + + u64 follows, flush_tid; + int issued, dirty; + struct ceph_snap_context *context; + + mode_t mode; + uid_t uid; + gid_t gid; + + void *xattr_blob; + int xattr_len; + u64 xattr_version; + + u64 size; + struct timespec mtime, atime, ctime; + u64 time_warp_seq; + int writing; /* a sync write is still in progress */ + int dirty_pages; /* dirty pages awaiting writeback */ +}; + +static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) +{ + if (atomic_dec_and_test(&capsnap->nref)) + kfree(capsnap); +} + +/* + * The frag tree describes how a directory is fragmented, potentially across + * multiple metadata servers. It is also used to indicate points where + * metadata authority is delegated, and whether/where metadata is replicated. + * + * A _leaf_ frag will be present in the i_fragtree IFF there is + * delegation info. That is, if mds >= 0 || ndist > 0. + */ +#define CEPH_MAX_DIRFRAG_REP 4 + +struct ceph_inode_frag { + struct rb_node node; + + /* fragtree state */ + u32 frag; + int split_by; /* i.e. 2^(split_by) children */ + + /* delegation and replication info */ + int mds; /* -1 if same authority as parent */ + int ndist; /* >0 if replicated */ + int dist[CEPH_MAX_DIRFRAG_REP]; +}; + +/* + * We cache inode xattrs as an encoded blob until they are first used, + * at which point we parse them into an rbtree. + */ +struct ceph_inode_xattr { + struct rb_node node; + + const char *name; + int name_len; + const char *val; + int val_len; + int dirty; + + int should_free_name; + int should_free_val; +}; + +struct ceph_inode_xattrs_info { + /* + * (still encoded) xattr blob. we avoid the overhead of parsing + * this until someone actually calls getxattr, etc. + * + * blob->vec.iov_len == 4 implies there are no xattrs; blob == + * NULL means we don't know. + */ + struct ceph_buffer *blob, *prealloc_blob; + + struct rb_root index; + bool dirty; + int count; + int names_size; + int vals_size; + u64 version, index_version; +}; + +/* + * Ceph inode. + */ +#define CEPH_I_COMPLETE 1 /* we have complete directory cached */ +#define CEPH_I_NODELAY 4 /* do not delay cap release */ +#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ +#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ + +struct ceph_inode_info { + struct ceph_vino i_vino; /* ceph ino + snap */ + + u64 i_version; + u32 i_time_warp_seq; + + unsigned i_ceph_flags; + unsigned long i_release_count; + + struct ceph_file_layout i_layout; + char *i_symlink; + + /* for dirs */ + struct timespec i_rctime; + u64 i_rbytes, i_rfiles, i_rsubdirs; + u64 i_files, i_subdirs; + u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */ + + struct rb_root i_fragtree; + struct mutex i_fragtree_mutex; + + struct ceph_inode_xattrs_info i_xattrs; + + /* capabilities. protected _both_ by i_lock and cap->session's + * s_mutex. */ + struct rb_root i_caps; /* cap list */ + struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ + unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ + struct list_head i_dirty_item, i_flushing_item; + u64 i_cap_flush_seq; + /* we need to track cap writeback on a per-cap-bit basis, to allow + * overlapping, pipelined cap flushes to the mds. we can probably + * reduce the tid to 8 bits if we're concerned about inode size. */ + u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS]; + wait_queue_head_t i_cap_wq; /* threads waiting on a capability */ + unsigned long i_hold_caps_min; /* jiffies */ + unsigned long i_hold_caps_max; /* jiffies */ + struct list_head i_cap_delay_list; /* for delayed cap release to mds */ + int i_cap_exporting_mds; /* to handle cap migration between */ + unsigned i_cap_exporting_mseq; /* mds's. */ + unsigned i_cap_exporting_issued; + struct ceph_cap_reservation i_cap_migration_resv; + struct list_head i_cap_snaps; /* snapped state pending flush to mds */ + struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */ + unsigned i_snap_caps; /* cap bits for snapped files */ + + int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ + + u32 i_truncate_seq; /* last truncate to smaller size */ + u64 i_truncate_size; /* and the size we last truncated down to */ + int i_truncate_pending; /* still need to call vmtruncate */ + + u64 i_max_size; /* max file size authorized by mds */ + u64 i_reported_size; /* (max_)size reported to or requested of mds */ + u64 i_wanted_max_size; /* offset we'd like to write too */ + u64 i_requested_max_size; /* max_size we've requested */ + + /* held references to caps */ + int i_pin_ref; + int i_rd_ref, i_rdcache_ref, i_wr_ref; + int i_wrbuffer_ref, i_wrbuffer_ref_head; + u32 i_shared_gen; /* increment each time we get FILE_SHARED */ + u32 i_rdcache_gen; /* we increment this each time we get + FILE_CACHE. If it's non-zero, we + _may_ have cached pages. */ + u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ + + struct list_head i_unsafe_writes; /* uncommitted sync writes */ + struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */ + spinlock_t i_unsafe_lock; + + struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ + int i_snap_realm_counter; /* snap realm (if caps) */ + struct list_head i_snap_realm_item; + struct list_head i_snap_flush_item; + + struct work_struct i_wb_work; /* writeback work */ + struct work_struct i_pg_inv_work; /* page invalidation work */ + + struct work_struct i_vmtruncate_work; + + struct inode vfs_inode; /* at end */ +}; + +static inline struct ceph_inode_info *ceph_inode(struct inode *inode) +{ + return container_of(inode, struct ceph_inode_info, vfs_inode); +} + +static inline void ceph_i_clear(struct inode *inode, unsigned mask) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + spin_lock(&inode->i_lock); + ci->i_ceph_flags &= ~mask; + spin_unlock(&inode->i_lock); +} + +static inline void ceph_i_set(struct inode *inode, unsigned mask) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + spin_lock(&inode->i_lock); + ci->i_ceph_flags |= mask; + spin_unlock(&inode->i_lock); +} + +static inline bool ceph_i_test(struct inode *inode, unsigned mask) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + bool r; + + smp_mb(); + r = (ci->i_ceph_flags & mask) == mask; + return r; +} + + +/* find a specific frag @f */ +extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, + u32 f); + +/* + * choose fragment for value @v. copy frag content to pfrag, if leaf + * exists + */ +extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, + struct ceph_inode_frag *pfrag, + int *found); + +/* + * Ceph dentry state + */ +struct ceph_dentry_info { + struct ceph_mds_session *lease_session; + u32 lease_gen, lease_shared_gen; + u32 lease_seq; + unsigned long lease_renew_after, lease_renew_from; + struct list_head lru; + struct dentry *dentry; + u64 time; + u64 offset; +}; + +static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) +{ + return (struct ceph_dentry_info *)dentry->d_fsdata; +} + +static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) +{ + return ((loff_t)frag << 32) | (loff_t)off; +} + +/* + * ino_t is <64 bits on many architectures, blech. + * + * don't include snap in ino hash, at least for now. + */ +static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) +{ + ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ +#if BITS_PER_LONG == 32 + ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8; + if (!ino) + ino = 1; +#endif + return ino; +} + +static inline int ceph_set_ino_cb(struct inode *inode, void *data) +{ + ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; + inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data); + return 0; +} + +static inline struct ceph_vino ceph_vino(struct inode *inode) +{ + return ceph_inode(inode)->i_vino; +} + +/* for printf-style formatting */ +#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap + +static inline u64 ceph_ino(struct inode *inode) +{ + return ceph_inode(inode)->i_vino.ino; +} +static inline u64 ceph_snap(struct inode *inode) +{ + return ceph_inode(inode)->i_vino.snap; +} + +static inline int ceph_ino_compare(struct inode *inode, void *data) +{ + struct ceph_vino *pvino = (struct ceph_vino *)data; + struct ceph_inode_info *ci = ceph_inode(inode); + return ci->i_vino.ino == pvino->ino && + ci->i_vino.snap == pvino->snap; +} + +static inline struct inode *ceph_find_inode(struct super_block *sb, + struct ceph_vino vino) +{ + ino_t t = ceph_vino_to_ino(vino); + return ilookup5(sb, t, ceph_ino_compare, &vino); +} + + +/* + * caps helpers + */ +static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci) +{ + return !RB_EMPTY_ROOT(&ci->i_caps); +} + +extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented); +extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t); +extern int __ceph_caps_issued_other(struct ceph_inode_info *ci, + struct ceph_cap *cap); + +static inline int ceph_caps_issued(struct ceph_inode_info *ci) +{ + int issued; + spin_lock(&ci->vfs_inode.i_lock); + issued = __ceph_caps_issued(ci, NULL); + spin_unlock(&ci->vfs_inode.i_lock); + return issued; +} + +static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, + int touch) +{ + int r; + spin_lock(&ci->vfs_inode.i_lock); + r = __ceph_caps_issued_mask(ci, mask, touch); + spin_unlock(&ci->vfs_inode.i_lock); + return r; +} + +static inline int __ceph_caps_dirty(struct ceph_inode_info *ci) +{ + return ci->i_dirty_caps | ci->i_flushing_caps; +} +extern void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); + +extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask); +extern int __ceph_caps_used(struct ceph_inode_info *ci); + +extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci); + +/* + * wanted, by virtue of open file modes AND cap refs (buffered/cached data) + */ +static inline int __ceph_caps_wanted(struct ceph_inode_info *ci) +{ + int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); + if (w & CEPH_CAP_FILE_BUFFER) + w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */ + return w; +} + +/* what the mds thinks we want */ +extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci); + +extern void ceph_caps_init(void); +extern void ceph_caps_finalize(void); +extern void ceph_adjust_min_caps(int delta); +extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need); +extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx); +extern void ceph_reservation_status(struct ceph_client *client, + int *total, int *avail, int *used, + int *reserved, int *min); + +static inline struct ceph_client *ceph_inode_to_client(struct inode *inode) +{ + return (struct ceph_client *)inode->i_sb->s_fs_info; +} + +static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb) +{ + return (struct ceph_client *)sb->s_fs_info; +} + + +/* + * we keep buffered readdir results attached to file->private_data + */ +struct ceph_file_info { + int fmode; /* initialized on open */ + + /* readdir: position within the dir */ + u32 frag; + struct ceph_mds_request *last_readdir; + int at_end; + + /* readdir: position within a frag */ + unsigned offset; /* offset of last chunk, adjusted for . and .. */ + u64 next_offset; /* offset of next chunk (last_name's + 1) */ + char *last_name; /* last entry in previous chunk */ + struct dentry *dentry; /* next dentry (for dcache readdir) */ + unsigned long dir_release_count; + + /* used for -o dirstat read() on directory thing */ + char *dir_info; + int dir_info_len; +}; + + + +/* + * snapshots + */ + +/* + * A "snap context" is the set of existing snapshots when we + * write data. It is used by the OSD to guide its COW behavior. + * + * The ceph_snap_context is refcounted, and attached to each dirty + * page, indicating which context the dirty data belonged when it was + * dirtied. + */ +struct ceph_snap_context { + atomic_t nref; + u64 seq; + int num_snaps; + u64 snaps[]; +}; + +static inline struct ceph_snap_context * +ceph_get_snap_context(struct ceph_snap_context *sc) +{ + /* + printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), + atomic_read(&sc->nref)+1); + */ + if (sc) + atomic_inc(&sc->nref); + return sc; +} + +static inline void ceph_put_snap_context(struct ceph_snap_context *sc) +{ + if (!sc) + return; + /* + printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), + atomic_read(&sc->nref)-1); + */ + if (atomic_dec_and_test(&sc->nref)) { + /*printk(" deleting snap_context %p\n", sc);*/ + kfree(sc); + } +} + +/* + * A "snap realm" describes a subset of the file hierarchy sharing + * the same set of snapshots that apply to it. The realms themselves + * are organized into a hierarchy, such that children inherit (some of) + * the snapshots of their parents. + * + * All inodes within the realm that have capabilities are linked into a + * per-realm list. + */ +struct ceph_snap_realm { + u64 ino; + atomic_t nref; + struct rb_node node; + + u64 created, seq; + u64 parent_ino; + u64 parent_since; /* snapid when our current parent became so */ + + u64 *prior_parent_snaps; /* snaps inherited from any parents we */ + int num_prior_parent_snaps; /* had prior to parent_since */ + u64 *snaps; /* snaps specific to this realm */ + int num_snaps; + + struct ceph_snap_realm *parent; + struct list_head children; /* list of child realms */ + struct list_head child_item; + + struct list_head empty_item; /* if i have ref==0 */ + + /* the current set of snaps for this realm */ + struct ceph_snap_context *cached_context; + + struct list_head inodes_with_caps; + spinlock_t inodes_with_caps_lock; +}; + + + +/* + * calculate the number of pages a given length and offset map onto, + * if we align the data. + */ +static inline int calc_pages_for(u64 off, u64 len) +{ + return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) - + (off >> PAGE_CACHE_SHIFT); +} + + + +/* snap.c */ +struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, + u64 ino); +extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm); +extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm); +extern int ceph_update_snap_trace(struct ceph_mds_client *m, + void *p, void *e, bool deletion); +extern void ceph_handle_snap(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg); +extern void ceph_queue_cap_snap(struct ceph_inode_info *ci, + struct ceph_snap_context *snapc); +extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, + struct ceph_cap_snap *capsnap); +extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); + +/* + * a cap_snap is "pending" if it is still awaiting an in-progress + * sync write (that may/may not still update size, mtime, etc.). + */ +static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) +{ + return !list_empty(&ci->i_cap_snaps) && + list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap, + ci_item)->writing; +} + + +/* super.c */ +extern struct kmem_cache *ceph_inode_cachep; +extern struct kmem_cache *ceph_cap_cachep; +extern struct kmem_cache *ceph_dentry_cachep; +extern struct kmem_cache *ceph_file_cachep; + +extern const char *ceph_msg_type_name(int type); +extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); + +#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \ + "%02x%02x%02x%02x%02x%02x" +#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \ + (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7], \ + (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \ + (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15] + +/* inode.c */ +extern const struct inode_operations ceph_file_iops; + +extern struct inode *ceph_alloc_inode(struct super_block *sb); +extern void ceph_destroy_inode(struct inode *inode); + +extern struct inode *ceph_get_inode(struct super_block *sb, + struct ceph_vino vino); +extern struct inode *ceph_get_snapdir(struct inode *parent); +extern int ceph_fill_file_size(struct inode *inode, int issued, + u32 truncate_seq, u64 truncate_size, u64 size); +extern void ceph_fill_file_time(struct inode *inode, int issued, + u64 time_warp_seq, struct timespec *ctime, + struct timespec *mtime, struct timespec *atime); +extern int ceph_fill_trace(struct super_block *sb, + struct ceph_mds_request *req, + struct ceph_mds_session *session); +extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, + struct ceph_mds_session *session); + +extern int ceph_inode_holds_cap(struct inode *inode, int mask); + +extern int ceph_inode_set_size(struct inode *inode, loff_t size); +extern void __ceph_do_pending_vmtruncate(struct inode *inode); +extern void ceph_queue_vmtruncate(struct inode *inode); + +extern void ceph_queue_invalidate(struct inode *inode); +extern void ceph_queue_writeback(struct inode *inode); + +extern int ceph_do_getattr(struct inode *inode, int mask); +extern int ceph_permission(struct inode *inode, int mask); +extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); +extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); + +/* xattr.c */ +extern int ceph_setxattr(struct dentry *, const char *, const void *, + size_t, int); +extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t); +extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); +extern int ceph_removexattr(struct dentry *, const char *); +extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci); +extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); + +/* caps.c */ +extern const char *ceph_cap_string(int c); +extern void ceph_handle_caps(struct ceph_mds_session *session, + struct ceph_msg *msg); +extern int ceph_add_cap(struct inode *inode, + struct ceph_mds_session *session, u64 cap_id, + int fmode, unsigned issued, unsigned wanted, + unsigned cap, unsigned seq, u64 realmino, int flags, + struct ceph_cap_reservation *caps_reservation); +extern void __ceph_remove_cap(struct ceph_cap *cap); +static inline void ceph_remove_cap(struct ceph_cap *cap) +{ + struct inode *inode = &cap->ci->vfs_inode; + spin_lock(&inode->i_lock); + __ceph_remove_cap(cap); + spin_unlock(&inode->i_lock); +} +extern void ceph_put_cap(struct ceph_cap *cap); + +extern void ceph_queue_caps_release(struct inode *inode); +extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); +extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync); +extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); +extern int ceph_get_cap_mds(struct inode *inode); +extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); +extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); +extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, + struct ceph_snap_context *snapc); +extern void __ceph_flush_snaps(struct ceph_inode_info *ci, + struct ceph_mds_session **psession); +extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, + struct ceph_mds_session *session); +extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc); +extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc); + +extern int ceph_encode_inode_release(void **p, struct inode *inode, + int mds, int drop, int unless, int force); +extern int ceph_encode_dentry_release(void **p, struct dentry *dn, + int mds, int drop, int unless); + +extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, + int *got, loff_t endoff); + +/* for counting open files by mode */ +static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode) +{ + ci->i_nr_by_mode[mode]++; +} +extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode); + +/* addr.c */ +extern const struct address_space_operations ceph_aops; +extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); + +/* file.c */ +extern const struct file_operations ceph_file_fops; +extern const struct address_space_operations ceph_aops; +extern int ceph_open(struct inode *inode, struct file *file); +extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, + struct nameidata *nd, int mode, + int locked_dir); +extern int ceph_release(struct inode *inode, struct file *filp); +extern void ceph_release_page_vector(struct page **pages, int num_pages); + +/* dir.c */ +extern const struct file_operations ceph_dir_fops; +extern const struct inode_operations ceph_dir_iops; +extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, + ceph_snapdir_dentry_ops; + +extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); +extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, + struct dentry *dentry, int err); + +extern void ceph_dentry_lru_add(struct dentry *dn); +extern void ceph_dentry_lru_touch(struct dentry *dn); +extern void ceph_dentry_lru_del(struct dentry *dn); + +/* + * our d_ops vary depending on whether the inode is live, + * snapshotted (read-only), or a virtual ".snap" directory. + */ +int ceph_init_dentry(struct dentry *dentry); + + +/* ioctl.c */ +extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg); + +/* export.c */ +extern const struct export_operations ceph_export_ops; + +/* debugfs.c */ +extern int ceph_debugfs_init(void); +extern void ceph_debugfs_cleanup(void); +extern int ceph_debugfs_client_init(struct ceph_client *client); +extern void ceph_debugfs_client_cleanup(struct ceph_client *client); + +static inline struct inode *get_dentry_parent_inode(struct dentry *dentry) +{ + if (dentry && dentry->d_parent) + return dentry->d_parent->d_inode; + + return NULL; +} + +#endif /* _FS_CEPH_SUPER_H */ diff --git a/fs/ceph/types.h b/fs/ceph/types.h new file mode 100644 index 00000000000..28b35a005ec --- /dev/null +++ b/fs/ceph/types.h @@ -0,0 +1,29 @@ +#ifndef _FS_CEPH_TYPES_H +#define _FS_CEPH_TYPES_H + +/* needed before including ceph_fs.h */ +#include <linux/in.h> +#include <linux/types.h> +#include <linux/fcntl.h> +#include <linux/string.h> + +#include "ceph_fs.h" +#include "ceph_frag.h" +#include "ceph_hash.h" + +/* + * Identify inodes by both their ino AND snapshot id (a u64). + */ +struct ceph_vino { + u64 ino; + u64 snap; +}; + + +/* context for the caps reservation mechanism */ +struct ceph_cap_reservation { + int count; +}; + + +#endif diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c new file mode 100644 index 00000000000..37d6ce64569 --- /dev/null +++ b/fs/ceph/xattr.c @@ -0,0 +1,844 @@ +#include "ceph_debug.h" +#include "super.h" +#include "decode.h" + +#include <linux/xattr.h> + +static bool ceph_is_valid_xattr(const char *name) +{ + return !strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || + !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); +} + +/* + * These define virtual xattrs exposing the recursive directory + * statistics and layout metadata. + */ +struct ceph_vxattr_cb { + bool readonly; + char *name; + size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, + size_t size); +}; + +/* directories */ + +static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs); +} + +static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_files); +} + +static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_subdirs); +} + +static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs); +} + +static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_rfiles); +} + +static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_rsubdirs); +} + +static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_rbytes); +} + +static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec, + (long)ci->i_rctime.tv_nsec); +} + +static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { + { true, "user.ceph.dir.entries", ceph_vxattrcb_entries}, + { true, "user.ceph.dir.files", ceph_vxattrcb_files}, + { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs}, + { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries}, + { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles}, + { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, + { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes}, + { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime}, + { true, NULL, NULL } +}; + +/* files */ + +static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, + size_t size) +{ + int ret; + + ret = snprintf(val, size, + "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n", + (unsigned long long)ceph_file_layout_su(ci->i_layout), + (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), + (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); + if (ceph_file_layout_pg_preferred(ci->i_layout)) + ret += snprintf(val + ret, size, "preferred_osd=%lld\n", + (unsigned long long)ceph_file_layout_pg_preferred( + ci->i_layout)); + return ret; +} + +static struct ceph_vxattr_cb ceph_file_vxattrs[] = { + { true, "user.ceph.layout", ceph_vxattrcb_layout}, + { NULL, NULL } +}; + +static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode) +{ + if (S_ISDIR(inode->i_mode)) + return ceph_dir_vxattrs; + else if (S_ISREG(inode->i_mode)) + return ceph_file_vxattrs; + return NULL; +} + +static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr, + const char *name) +{ + do { + if (strcmp(vxattr->name, name) == 0) + return vxattr; + vxattr++; + } while (vxattr->name); + return NULL; +} + +static int __set_xattr(struct ceph_inode_info *ci, + const char *name, int name_len, + const char *val, int val_len, + int dirty, + int should_free_name, int should_free_val, + struct ceph_inode_xattr **newxattr) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct ceph_inode_xattr *xattr = NULL; + int c; + int new = 0; + + p = &ci->i_xattrs.index.rb_node; + while (*p) { + parent = *p; + xattr = rb_entry(parent, struct ceph_inode_xattr, node); + c = strncmp(name, xattr->name, min(name_len, xattr->name_len)); + if (c < 0) + p = &(*p)->rb_left; + else if (c > 0) + p = &(*p)->rb_right; + else { + if (name_len == xattr->name_len) + break; + else if (name_len < xattr->name_len) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + xattr = NULL; + } + + if (!xattr) { + new = 1; + xattr = *newxattr; + xattr->name = name; + xattr->name_len = name_len; + xattr->should_free_name = should_free_name; + + ci->i_xattrs.count++; + dout("__set_xattr count=%d\n", ci->i_xattrs.count); + } else { + kfree(*newxattr); + *newxattr = NULL; + if (xattr->should_free_val) + kfree((void *)xattr->val); + + if (should_free_name) { + kfree((void *)name); + name = xattr->name; + } + ci->i_xattrs.names_size -= xattr->name_len; + ci->i_xattrs.vals_size -= xattr->val_len; + } + if (!xattr) { + pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n", + &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name, + xattr->val); + return -ENOMEM; + } + ci->i_xattrs.names_size += name_len; + ci->i_xattrs.vals_size += val_len; + if (val) + xattr->val = val; + else + xattr->val = ""; + + xattr->val_len = val_len; + xattr->dirty = dirty; + xattr->should_free_val = (val && should_free_val); + + if (new) { + rb_link_node(&xattr->node, parent, p); + rb_insert_color(&xattr->node, &ci->i_xattrs.index); + dout("__set_xattr_val p=%p\n", p); + } + + dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n", + ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val); + + return 0; +} + +static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci, + const char *name) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct ceph_inode_xattr *xattr = NULL; + int c; + + p = &ci->i_xattrs.index.rb_node; + while (*p) { + parent = *p; + xattr = rb_entry(parent, struct ceph_inode_xattr, node); + c = strncmp(name, xattr->name, xattr->name_len); + if (c < 0) + p = &(*p)->rb_left; + else if (c > 0) + p = &(*p)->rb_right; + else { + dout("__get_xattr %s: found %.*s\n", name, + xattr->val_len, xattr->val); + return xattr; + } + } + + dout("__get_xattr %s: not found\n", name); + + return NULL; +} + +static void __free_xattr(struct ceph_inode_xattr *xattr) +{ + BUG_ON(!xattr); + + if (xattr->should_free_name) + kfree((void *)xattr->name); + if (xattr->should_free_val) + kfree((void *)xattr->val); + + kfree(xattr); +} + +static int __remove_xattr(struct ceph_inode_info *ci, + struct ceph_inode_xattr *xattr) +{ + if (!xattr) + return -EOPNOTSUPP; + + rb_erase(&xattr->node, &ci->i_xattrs.index); + + if (xattr->should_free_name) + kfree((void *)xattr->name); + if (xattr->should_free_val) + kfree((void *)xattr->val); + + ci->i_xattrs.names_size -= xattr->name_len; + ci->i_xattrs.vals_size -= xattr->val_len; + ci->i_xattrs.count--; + kfree(xattr); + + return 0; +} + +static int __remove_xattr_by_name(struct ceph_inode_info *ci, + const char *name) +{ + struct rb_node **p; + struct ceph_inode_xattr *xattr; + int err; + + p = &ci->i_xattrs.index.rb_node; + xattr = __get_xattr(ci, name); + err = __remove_xattr(ci, xattr); + return err; +} + +static char *__copy_xattr_names(struct ceph_inode_info *ci, + char *dest) +{ + struct rb_node *p; + struct ceph_inode_xattr *xattr = NULL; + + p = rb_first(&ci->i_xattrs.index); + dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count); + + while (p) { + xattr = rb_entry(p, struct ceph_inode_xattr, node); + memcpy(dest, xattr->name, xattr->name_len); + dest[xattr->name_len] = '\0'; + + dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name, + xattr->name_len, ci->i_xattrs.names_size); + + dest += xattr->name_len + 1; + p = rb_next(p); + } + + return dest; +} + +void __ceph_destroy_xattrs(struct ceph_inode_info *ci) +{ + struct rb_node *p, *tmp; + struct ceph_inode_xattr *xattr = NULL; + + p = rb_first(&ci->i_xattrs.index); + + dout("__ceph_destroy_xattrs p=%p\n", p); + + while (p) { + xattr = rb_entry(p, struct ceph_inode_xattr, node); + tmp = p; + p = rb_next(tmp); + dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p, + xattr->name_len, xattr->name); + rb_erase(tmp, &ci->i_xattrs.index); + + __free_xattr(xattr); + } + + ci->i_xattrs.names_size = 0; + ci->i_xattrs.vals_size = 0; + ci->i_xattrs.index_version = 0; + ci->i_xattrs.count = 0; + ci->i_xattrs.index = RB_ROOT; +} + +static int __build_xattrs(struct inode *inode) +{ + u32 namelen; + u32 numattr = 0; + void *p, *end; + u32 len; + const char *name, *val; + struct ceph_inode_info *ci = ceph_inode(inode); + int xattr_version; + struct ceph_inode_xattr **xattrs = NULL; + int err = 0; + int i; + + dout("__build_xattrs() len=%d\n", + ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0); + + if (ci->i_xattrs.index_version >= ci->i_xattrs.version) + return 0; /* already built */ + + __ceph_destroy_xattrs(ci); + +start: + /* updated internal xattr rb tree */ + if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) { + p = ci->i_xattrs.blob->vec.iov_base; + end = p + ci->i_xattrs.blob->vec.iov_len; + ceph_decode_32_safe(&p, end, numattr, bad); + xattr_version = ci->i_xattrs.version; + spin_unlock(&inode->i_lock); + + xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *), + GFP_NOFS); + err = -ENOMEM; + if (!xattrs) + goto bad_lock; + memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *)); + for (i = 0; i < numattr; i++) { + xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr), + GFP_NOFS); + if (!xattrs[i]) + goto bad_lock; + } + + spin_lock(&inode->i_lock); + if (ci->i_xattrs.version != xattr_version) { + /* lost a race, retry */ + for (i = 0; i < numattr; i++) + kfree(xattrs[i]); + kfree(xattrs); + goto start; + } + err = -EIO; + while (numattr--) { + ceph_decode_32_safe(&p, end, len, bad); + namelen = len; + name = p; + p += len; + ceph_decode_32_safe(&p, end, len, bad); + val = p; + p += len; + + err = __set_xattr(ci, name, namelen, val, len, + 0, 0, 0, &xattrs[numattr]); + + if (err < 0) + goto bad; + } + kfree(xattrs); + } + ci->i_xattrs.index_version = ci->i_xattrs.version; + ci->i_xattrs.dirty = false; + + return err; +bad_lock: + spin_lock(&inode->i_lock); +bad: + if (xattrs) { + for (i = 0; i < numattr; i++) + kfree(xattrs[i]); + kfree(xattrs); + } + ci->i_xattrs.names_size = 0; + return err; +} + +static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size, + int val_size) +{ + /* + * 4 bytes for the length, and additional 4 bytes per each xattr name, + * 4 bytes per each value + */ + int size = 4 + ci->i_xattrs.count*(4 + 4) + + ci->i_xattrs.names_size + + ci->i_xattrs.vals_size; + dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n", + ci->i_xattrs.count, ci->i_xattrs.names_size, + ci->i_xattrs.vals_size); + + if (name_size) + size += 4 + 4 + name_size + val_size; + + return size; +} + +/* + * If there are dirty xattrs, reencode xattrs into the prealloc_blob + * and swap into place. + */ +void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) +{ + struct rb_node *p; + struct ceph_inode_xattr *xattr = NULL; + void *dest; + + dout("__build_xattrs_blob %p\n", &ci->vfs_inode); + if (ci->i_xattrs.dirty) { + int need = __get_required_blob_size(ci, 0, 0); + + BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len); + + p = rb_first(&ci->i_xattrs.index); + dest = ci->i_xattrs.prealloc_blob->vec.iov_base; + + ceph_encode_32(&dest, ci->i_xattrs.count); + while (p) { + xattr = rb_entry(p, struct ceph_inode_xattr, node); + + ceph_encode_32(&dest, xattr->name_len); + memcpy(dest, xattr->name, xattr->name_len); + dest += xattr->name_len; + ceph_encode_32(&dest, xattr->val_len); + memcpy(dest, xattr->val, xattr->val_len); + dest += xattr->val_len; + + p = rb_next(p); + } + + /* adjust buffer len; it may be larger than we need */ + ci->i_xattrs.prealloc_blob->vec.iov_len = + dest - ci->i_xattrs.prealloc_blob->vec.iov_base; + + if (ci->i_xattrs.blob) + ceph_buffer_put(ci->i_xattrs.blob); + ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob; + ci->i_xattrs.prealloc_blob = NULL; + ci->i_xattrs.dirty = false; + } +} + +ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, + size_t size) +{ + struct inode *inode = dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); + int err; + struct ceph_inode_xattr *xattr; + struct ceph_vxattr_cb *vxattr = NULL; + + if (!ceph_is_valid_xattr(name)) + return -ENODATA; + + /* let's see if a virtual xattr was requested */ + if (vxattrs) + vxattr = ceph_match_vxattr(vxattrs, name); + + spin_lock(&inode->i_lock); + dout("getxattr %p ver=%lld index_ver=%lld\n", inode, + ci->i_xattrs.version, ci->i_xattrs.index_version); + + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && + (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { + goto get_xattr; + } else { + spin_unlock(&inode->i_lock); + /* get xattrs from mds (if we don't already have them) */ + err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); + if (err) + return err; + } + + spin_lock(&inode->i_lock); + + if (vxattr && vxattr->readonly) { + err = vxattr->getxattr_cb(ci, value, size); + goto out; + } + + err = __build_xattrs(inode); + if (err < 0) + goto out; + +get_xattr: + err = -ENODATA; /* == ENOATTR */ + xattr = __get_xattr(ci, name); + if (!xattr) { + if (vxattr) + err = vxattr->getxattr_cb(ci, value, size); + goto out; + } + + err = -ERANGE; + if (size && size < xattr->val_len) + goto out; + + err = xattr->val_len; + if (size == 0) + goto out; + + memcpy(value, xattr->val, xattr->val_len); + +out: + spin_unlock(&inode->i_lock); + return err; +} + +ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); + u32 vir_namelen = 0; + u32 namelen; + int err; + u32 len; + int i; + + spin_lock(&inode->i_lock); + dout("listxattr %p ver=%lld index_ver=%lld\n", inode, + ci->i_xattrs.version, ci->i_xattrs.index_version); + + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && + (ci->i_xattrs.index_version > ci->i_xattrs.version)) { + goto list_xattr; + } else { + spin_unlock(&inode->i_lock); + err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); + if (err) + return err; + } + + spin_lock(&inode->i_lock); + + err = __build_xattrs(inode); + if (err < 0) + goto out; + +list_xattr: + vir_namelen = 0; + /* include virtual dir xattrs */ + if (vxattrs) + for (i = 0; vxattrs[i].name; i++) + vir_namelen += strlen(vxattrs[i].name) + 1; + /* adding 1 byte per each variable due to the null termination */ + namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count; + err = -ERANGE; + if (size && namelen > size) + goto out; + + err = namelen; + if (size == 0) + goto out; + + names = __copy_xattr_names(ci, names); + + /* virtual xattr names, too */ + if (vxattrs) + for (i = 0; vxattrs[i].name; i++) { + len = sprintf(names, "%s", vxattrs[i].name); + names += len + 1; + } + +out: + spin_unlock(&inode->i_lock); + return err; +} + +static int ceph_sync_setxattr(struct dentry *dentry, const char *name, + const char *value, size_t size, int flags) +{ + struct ceph_client *client = ceph_client(dentry->d_sb); + struct inode *inode = dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct inode *parent_inode = dentry->d_parent->d_inode; + struct ceph_mds_request *req; + struct ceph_mds_client *mdsc = &client->mdsc; + int err; + int i, nr_pages; + struct page **pages = NULL; + void *kaddr; + + /* copy value into some pages */ + nr_pages = calc_pages_for(0, size); + if (nr_pages) { + pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS); + if (!pages) + return -ENOMEM; + err = -ENOMEM; + for (i = 0; i < nr_pages; i++) { + pages[i] = alloc_page(GFP_NOFS); + if (!pages[i]) { + nr_pages = i; + goto out; + } + kaddr = kmap(pages[i]); + memcpy(kaddr, value + i*PAGE_CACHE_SIZE, + min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE)); + } + } + + dout("setxattr value=%.*s\n", (int)size, value); + + /* do request */ + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, + USE_AUTH_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + req->r_inode = igrab(inode); + req->r_inode_drop = CEPH_CAP_XATTR_SHARED; + req->r_num_caps = 1; + req->r_args.setxattr.flags = cpu_to_le32(flags); + req->r_path2 = kstrdup(name, GFP_NOFS); + + req->r_pages = pages; + req->r_num_pages = nr_pages; + req->r_data_len = size; + + dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); + err = ceph_mdsc_do_request(mdsc, parent_inode, req); + ceph_mdsc_put_request(req); + dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); + +out: + if (pages) { + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kfree(pages); + } + return err; +} + +int ceph_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); + int err; + int name_len = strlen(name); + int val_len = size; + char *newname = NULL; + char *newval = NULL; + struct ceph_inode_xattr *xattr = NULL; + int issued; + int required_blob_size; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EROFS; + + if (!ceph_is_valid_xattr(name)) + return -EOPNOTSUPP; + + if (vxattrs) { + struct ceph_vxattr_cb *vxattr = + ceph_match_vxattr(vxattrs, name); + if (vxattr && vxattr->readonly) + return -EOPNOTSUPP; + } + + /* preallocate memory for xattr name, value, index node */ + err = -ENOMEM; + newname = kmalloc(name_len + 1, GFP_NOFS); + if (!newname) + goto out; + memcpy(newname, name, name_len + 1); + + if (val_len) { + newval = kmalloc(val_len + 1, GFP_NOFS); + if (!newval) + goto out; + memcpy(newval, value, val_len); + newval[val_len] = '\0'; + } + + xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS); + if (!xattr) + goto out; + + spin_lock(&inode->i_lock); +retry: + issued = __ceph_caps_issued(ci, NULL); + if (!(issued & CEPH_CAP_XATTR_EXCL)) + goto do_sync; + __build_xattrs(inode); + + required_blob_size = __get_required_blob_size(ci, name_len, val_len); + + if (!ci->i_xattrs.prealloc_blob || + required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { + struct ceph_buffer *blob = NULL; + + spin_unlock(&inode->i_lock); + dout(" preaallocating new blob size=%d\n", required_blob_size); + blob = ceph_buffer_new(required_blob_size, GFP_NOFS); + if (!blob) + goto out; + spin_lock(&inode->i_lock); + if (ci->i_xattrs.prealloc_blob) + ceph_buffer_put(ci->i_xattrs.prealloc_blob); + ci->i_xattrs.prealloc_blob = blob; + goto retry; + } + + dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); + err = __set_xattr(ci, newname, name_len, newval, + val_len, 1, 1, 1, &xattr); + __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); + ci->i_xattrs.dirty = true; + inode->i_ctime = CURRENT_TIME; + spin_unlock(&inode->i_lock); + + return err; + +do_sync: + spin_unlock(&inode->i_lock); + err = ceph_sync_setxattr(dentry, name, value, size, flags); +out: + kfree(newname); + kfree(newval); + kfree(xattr); + return err; +} + +static int ceph_send_removexattr(struct dentry *dentry, const char *name) +{ + struct ceph_client *client = ceph_client(dentry->d_sb); + struct ceph_mds_client *mdsc = &client->mdsc; + struct inode *inode = dentry->d_inode; + struct inode *parent_inode = dentry->d_parent->d_inode; + struct ceph_mds_request *req; + int err; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR, + USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = igrab(inode); + req->r_inode_drop = CEPH_CAP_XATTR_SHARED; + req->r_num_caps = 1; + req->r_path2 = kstrdup(name, GFP_NOFS); + + err = ceph_mdsc_do_request(mdsc, parent_inode, req); + ceph_mdsc_put_request(req); + return err; +} + +int ceph_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); + int issued; + int err; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EROFS; + + if (!ceph_is_valid_xattr(name)) + return -EOPNOTSUPP; + + if (vxattrs) { + struct ceph_vxattr_cb *vxattr = + ceph_match_vxattr(vxattrs, name); + if (vxattr && vxattr->readonly) + return -EOPNOTSUPP; + } + + spin_lock(&inode->i_lock); + __build_xattrs(inode); + issued = __ceph_caps_issued(ci, NULL); + dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); + + if (!(issued & CEPH_CAP_XATTR_EXCL)) + goto do_sync; + + err = __remove_xattr_by_name(ceph_inode(inode), name); + __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); + ci->i_xattrs.dirty = true; + inode->i_ctime = CURRENT_TIME; + + spin_unlock(&inode->i_lock); + + return err; +do_sync: + spin_unlock(&inode->i_lock); + err = ceph_send_removexattr(dentry, name); + return err; +} + |