From 31b8006e1d79e127a776c9414e3e0b5f9508047e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 6 Oct 2009 11:31:13 -0700 Subject: ceph: messenger library A generic message passing library is used to communicate with all other components in the Ceph file system. The messenger library provides ordered, reliable delivery of messages between two nodes in the system. This implementation is based on TCP. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 2019 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2019 insertions(+) create mode 100644 fs/ceph/messenger.c (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c new file mode 100644 index 00000000000..63f7f135938 --- /dev/null +++ b/fs/ceph/messenger.c @@ -0,0 +1,2019 @@ +#include "ceph_debug.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "super.h" +#include "messenger.h" + +/* + * Ceph uses the messenger to exchange ceph_msg messages with other + * hosts in the system. The messenger provides ordered and reliable + * delivery. We tolerate TCP disconnects by reconnecting (with + * exponential backoff) in the case of a fault (disconnection, bad + * crc, protocol error). Acks allow sent messages to be discarded by + * the sender. + */ + +/* static tag bytes (protocol control messages) */ +static char tag_msg = CEPH_MSGR_TAG_MSG; +static char tag_ack = CEPH_MSGR_TAG_ACK; +static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; + + +static void queue_con(struct ceph_connection *con); +static void con_work(struct work_struct *); +static void ceph_fault(struct ceph_connection *con); + +const char *ceph_name_type_str(int t) +{ + switch (t) { + case CEPH_ENTITY_TYPE_MON: return "mon"; + case CEPH_ENTITY_TYPE_MDS: return "mds"; + case CEPH_ENTITY_TYPE_OSD: return "osd"; + case CEPH_ENTITY_TYPE_CLIENT: return "client"; + case CEPH_ENTITY_TYPE_ADMIN: return "admin"; + default: return "???"; + } +} + +/* + * nicely render a sockaddr as a string. + */ +#define MAX_ADDR_STR 20 +static char addr_str[MAX_ADDR_STR][40]; +static DEFINE_SPINLOCK(addr_str_lock); +static int last_addr_str; + +const char *pr_addr(const struct sockaddr_storage *ss) +{ + int i; + char *s; + struct sockaddr_in *in4 = (void *)ss; + unsigned char *quad = (void *)&in4->sin_addr.s_addr; + struct sockaddr_in6 *in6 = (void *)ss; + + spin_lock(&addr_str_lock); + i = last_addr_str++; + if (last_addr_str == MAX_ADDR_STR) + last_addr_str = 0; + spin_unlock(&addr_str_lock); + s = addr_str[i]; + + switch (ss->ss_family) { + case AF_INET: + sprintf(s, "%u.%u.%u.%u:%u", + (unsigned int)quad[0], + (unsigned int)quad[1], + (unsigned int)quad[2], + (unsigned int)quad[3], + (unsigned int)ntohs(in4->sin_port)); + break; + + case AF_INET6: + sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u", + in6->sin6_addr.s6_addr16[0], + in6->sin6_addr.s6_addr16[1], + in6->sin6_addr.s6_addr16[2], + in6->sin6_addr.s6_addr16[3], + in6->sin6_addr.s6_addr16[4], + in6->sin6_addr.s6_addr16[5], + in6->sin6_addr.s6_addr16[6], + in6->sin6_addr.s6_addr16[7], + (unsigned int)ntohs(in6->sin6_port)); + break; + + default: + sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family); + } + + return s; +} + +/* + * work queue for all reading and writing to/from the socket. + */ +struct workqueue_struct *ceph_msgr_wq; + +int __init ceph_msgr_init(void) +{ + ceph_msgr_wq = create_workqueue("ceph-msgr"); + if (IS_ERR(ceph_msgr_wq)) { + int ret = PTR_ERR(ceph_msgr_wq); + pr_err("msgr_init failed to create workqueue: %d\n", ret); + ceph_msgr_wq = NULL; + return ret; + } + return 0; +} + +void ceph_msgr_exit(void) +{ + destroy_workqueue(ceph_msgr_wq); +} + +/* + * socket callback functions + */ + +/* data available on socket, or listen socket received a connect */ +static void ceph_data_ready(struct sock *sk, int count_unused) +{ + struct ceph_connection *con = + (struct ceph_connection *)sk->sk_user_data; + if (sk->sk_state != TCP_CLOSE_WAIT) { + dout("ceph_data_ready on %p state = %lu, queueing work\n", + con, con->state); + queue_con(con); + } +} + +/* socket has buffer space for writing */ +static void ceph_write_space(struct sock *sk) +{ + struct ceph_connection *con = + (struct ceph_connection *)sk->sk_user_data; + + /* only queue to workqueue if there is data we want to write. */ + if (test_bit(WRITE_PENDING, &con->state)) { + dout("ceph_write_space %p queueing write work\n", con); + queue_con(con); + } else { + dout("ceph_write_space %p nothing to write\n", con); + } + + /* since we have our own write_space, clear the SOCK_NOSPACE flag */ + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +} + +/* socket's state has changed */ +static void ceph_state_change(struct sock *sk) +{ + struct ceph_connection *con = + (struct ceph_connection *)sk->sk_user_data; + + dout("ceph_state_change %p state = %lu sk_state = %u\n", + con, con->state, sk->sk_state); + + if (test_bit(CLOSED, &con->state)) + return; + + switch (sk->sk_state) { + case TCP_CLOSE: + dout("ceph_state_change TCP_CLOSE\n"); + case TCP_CLOSE_WAIT: + dout("ceph_state_change TCP_CLOSE_WAIT\n"); + if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) { + if (test_bit(CONNECTING, &con->state)) + con->error_msg = "connection failed"; + else + con->error_msg = "socket closed"; + queue_con(con); + } + break; + case TCP_ESTABLISHED: + dout("ceph_state_change TCP_ESTABLISHED\n"); + queue_con(con); + break; + } +} + +/* + * set up socket callbacks + */ +static void set_sock_callbacks(struct socket *sock, + struct ceph_connection *con) +{ + struct sock *sk = sock->sk; + sk->sk_user_data = (void *)con; + sk->sk_data_ready = ceph_data_ready; + sk->sk_write_space = ceph_write_space; + sk->sk_state_change = ceph_state_change; +} + + +/* + * socket helpers + */ + +/* + * initiate connection to a remote socket. + */ +static struct socket *ceph_tcp_connect(struct ceph_connection *con) +{ + struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr; + struct socket *sock; + int ret; + + BUG_ON(con->sock); + ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (ret) + return ERR_PTR(ret); + con->sock = sock; + sock->sk->sk_allocation = GFP_NOFS; + + set_sock_callbacks(sock, con); + + dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); + + ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK); + if (ret == -EINPROGRESS) { + dout("connect %s EINPROGRESS sk_state = %u\n", + pr_addr(&con->peer_addr.in_addr), + sock->sk->sk_state); + ret = 0; + } + if (ret < 0) { + pr_err("connect %s error %d\n", + pr_addr(&con->peer_addr.in_addr), ret); + sock_release(sock); + con->sock = NULL; + con->error_msg = "connect error"; + } + + if (ret < 0) + return ERR_PTR(ret); + return sock; +} + +static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) +{ + struct kvec iov = {buf, len}; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + + return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags); +} + +/* + * write something. @more is true if caller will be sending more data + * shortly. + */ +static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, + size_t kvlen, size_t len, int more) +{ + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + + if (more) + msg.msg_flags |= MSG_MORE; + else + msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ + + return kernel_sendmsg(sock, &msg, iov, kvlen, len); +} + + +/* + * Shutdown/close the socket for the given connection. + */ +static int con_close_socket(struct ceph_connection *con) +{ + int rc; + + dout("con_close_socket on %p sock %p\n", con, con->sock); + if (!con->sock) + return 0; + set_bit(SOCK_CLOSED, &con->state); + rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); + sock_release(con->sock); + con->sock = NULL; + clear_bit(SOCK_CLOSED, &con->state); + return rc; +} + +/* + * Reset a connection. Discard all incoming and outgoing messages + * and clear *_seq state. + */ +static void ceph_msg_remove(struct ceph_msg *msg) +{ + list_del_init(&msg->list_head); + ceph_msg_put(msg); +} +static void ceph_msg_remove_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct ceph_msg *msg = list_first_entry(head, struct ceph_msg, + list_head); + ceph_msg_remove(msg); + } +} + +static void reset_connection(struct ceph_connection *con) +{ + /* reset connection, out_queue, msg_ and connect_seq */ + /* discard existing out_queue and msg_seq */ + mutex_lock(&con->out_mutex); + ceph_msg_remove_list(&con->out_queue); + ceph_msg_remove_list(&con->out_sent); + + con->connect_seq = 0; + con->out_seq = 0; + con->out_msg = NULL; + con->in_seq = 0; + mutex_unlock(&con->out_mutex); +} + +/* + * mark a peer down. drop any open connections. + */ +void ceph_con_close(struct ceph_connection *con) +{ + dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr)); + set_bit(CLOSED, &con->state); /* in case there's queued work */ + clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */ + reset_connection(con); + queue_con(con); +} + +/* + * clean up connection state + */ +void ceph_con_shutdown(struct ceph_connection *con) +{ + dout("con_shutdown %p\n", con); + reset_connection(con); + set_bit(DEAD, &con->state); + con_close_socket(con); /* silently ignore errors */ +} + +/* + * Reopen a closed connection, with a new peer address. + */ +void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr) +{ + dout("con_open %p %s\n", con, pr_addr(&addr->in_addr)); + set_bit(OPENING, &con->state); + clear_bit(CLOSED, &con->state); + memcpy(&con->peer_addr, addr, sizeof(*addr)); + queue_con(con); +} + +/* + * generic get/put + */ +struct ceph_connection *ceph_con_get(struct ceph_connection *con) +{ + dout("con_get %p nref = %d -> %d\n", con, + atomic_read(&con->nref), atomic_read(&con->nref) + 1); + if (atomic_inc_not_zero(&con->nref)) + return con; + return NULL; +} + +void ceph_con_put(struct ceph_connection *con) +{ + dout("con_put %p nref = %d -> %d\n", con, + atomic_read(&con->nref), atomic_read(&con->nref) - 1); + BUG_ON(atomic_read(&con->nref) == 0); + if (atomic_dec_and_test(&con->nref)) { + ceph_con_shutdown(con); + kfree(con); + } +} + +/* + * initialize a new connection. + */ +void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con) +{ + dout("con_init %p\n", con); + memset(con, 0, sizeof(*con)); + atomic_set(&con->nref, 1); + con->msgr = msgr; + mutex_init(&con->out_mutex); + INIT_LIST_HEAD(&con->out_queue); + INIT_LIST_HEAD(&con->out_sent); + INIT_DELAYED_WORK(&con->work, con_work); +} + + +/* + * We maintain a global counter to order connection attempts. Get + * a unique seq greater than @gt. + */ +static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) +{ + u32 ret; + + spin_lock(&msgr->global_seq_lock); + if (msgr->global_seq < gt) + msgr->global_seq = gt; + ret = ++msgr->global_seq; + spin_unlock(&msgr->global_seq_lock); + return ret; +} + + +/* + * Prepare footer for currently outgoing message, and finish things + * off. Assumes out_kvec* are already valid.. we just add on to the end. + */ +static void prepare_write_message_footer(struct ceph_connection *con, int v) +{ + struct ceph_msg *m = con->out_msg; + + dout("prepare_write_message_footer %p\n", con); + con->out_kvec_is_msg = true; + con->out_kvec[v].iov_base = &m->footer; + con->out_kvec[v].iov_len = sizeof(m->footer); + con->out_kvec_bytes += sizeof(m->footer); + con->out_kvec_left++; + con->out_more = m->more_to_follow; + con->out_msg = NULL; /* we're done with this one */ +} + +/* + * Prepare headers for the next outgoing message. + */ +static void prepare_write_message(struct ceph_connection *con) +{ + struct ceph_msg *m; + int v = 0; + + con->out_kvec_bytes = 0; + con->out_kvec_is_msg = true; + + /* Sneak an ack in there first? If we can get it into the same + * TCP packet that's a good thing. */ + if (con->in_seq > con->in_seq_acked) { + con->in_seq_acked = con->in_seq; + con->out_kvec[v].iov_base = &tag_ack; + con->out_kvec[v++].iov_len = 1; + con->out_temp_ack = cpu_to_le64(con->in_seq_acked); + con->out_kvec[v].iov_base = &con->out_temp_ack; + con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack); + con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack); + } + + /* move message to sending/sent list */ + m = list_first_entry(&con->out_queue, + struct ceph_msg, list_head); + list_move_tail(&m->list_head, &con->out_sent); + con->out_msg = m; /* we don't bother taking a reference here. */ + + m->hdr.seq = cpu_to_le64(++con->out_seq); + + dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n", + m, con->out_seq, le16_to_cpu(m->hdr.type), + le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), + le32_to_cpu(m->hdr.data_len), + m->nr_pages); + BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); + + /* tag + hdr + front + middle */ + con->out_kvec[v].iov_base = &tag_msg; + con->out_kvec[v++].iov_len = 1; + con->out_kvec[v].iov_base = &m->hdr; + con->out_kvec[v++].iov_len = sizeof(m->hdr); + con->out_kvec[v++] = m->front; + if (m->middle) + con->out_kvec[v++] = m->middle->vec; + con->out_kvec_left = v; + con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len + + (m->middle ? m->middle->vec.iov_len : 0); + con->out_kvec_cur = con->out_kvec; + + /* fill in crc (except data pages), footer */ + con->out_msg->hdr.crc = + cpu_to_le32(crc32c(0, (void *)&m->hdr, + sizeof(m->hdr) - sizeof(m->hdr.crc))); + con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE; + con->out_msg->footer.front_crc = + cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len)); + if (m->middle) + con->out_msg->footer.middle_crc = + cpu_to_le32(crc32c(0, m->middle->vec.iov_base, + m->middle->vec.iov_len)); + else + con->out_msg->footer.middle_crc = 0; + con->out_msg->footer.data_crc = 0; + dout("prepare_write_message front_crc %u data_crc %u\n", + le32_to_cpu(con->out_msg->footer.front_crc), + le32_to_cpu(con->out_msg->footer.middle_crc)); + + /* is there a data payload? */ + if (le32_to_cpu(m->hdr.data_len) > 0) { + /* initialize page iterator */ + con->out_msg_pos.page = 0; + con->out_msg_pos.page_pos = + le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK; + con->out_msg_pos.data_pos = 0; + con->out_msg_pos.did_page_crc = 0; + con->out_more = 1; /* data + footer will follow */ + } else { + /* no, queue up footer too and be done */ + prepare_write_message_footer(con, v); + } + + set_bit(WRITE_PENDING, &con->state); +} + +/* + * Prepare an ack. + */ +static void prepare_write_ack(struct ceph_connection *con) +{ + dout("prepare_write_ack %p %llu -> %llu\n", con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + con->out_kvec[0].iov_base = &tag_ack; + con->out_kvec[0].iov_len = 1; + con->out_temp_ack = cpu_to_le64(con->in_seq_acked); + con->out_kvec[1].iov_base = &con->out_temp_ack; + con->out_kvec[1].iov_len = sizeof(con->out_temp_ack); + con->out_kvec_left = 2; + con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack); + con->out_kvec_cur = con->out_kvec; + con->out_more = 1; /* more will follow.. eventually.. */ + set_bit(WRITE_PENDING, &con->state); +} + +/* + * Prepare to write keepalive byte. + */ +static void prepare_write_keepalive(struct ceph_connection *con) +{ + dout("prepare_write_keepalive %p\n", con); + con->out_kvec[0].iov_base = &tag_keepalive; + con->out_kvec[0].iov_len = 1; + con->out_kvec_left = 1; + con->out_kvec_bytes = 1; + con->out_kvec_cur = con->out_kvec; + set_bit(WRITE_PENDING, &con->state); +} + +/* + * Connection negotiation. + */ + +/* + * We connected to a peer and are saying hello. + */ +static void prepare_write_connect(struct ceph_messenger *msgr, + struct ceph_connection *con) +{ + int len = strlen(CEPH_BANNER); + unsigned global_seq = get_global_seq(con->msgr, 0); + int proto; + + switch (con->peer_name.type) { + case CEPH_ENTITY_TYPE_MON: + proto = CEPH_MONC_PROTOCOL; + break; + case CEPH_ENTITY_TYPE_OSD: + proto = CEPH_OSDC_PROTOCOL; + break; + case CEPH_ENTITY_TYPE_MDS: + proto = CEPH_MDSC_PROTOCOL; + break; + default: + BUG(); + } + + dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, + con->connect_seq, global_seq, proto); + con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); + con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); + con->out_connect.global_seq = cpu_to_le32(global_seq); + con->out_connect.protocol_version = cpu_to_le32(proto); + con->out_connect.flags = 0; + if (test_bit(LOSSYTX, &con->state)) + con->out_connect.flags = CEPH_MSG_CONNECT_LOSSY; + + con->out_kvec[0].iov_base = CEPH_BANNER; + con->out_kvec[0].iov_len = len; + con->out_kvec[1].iov_base = &msgr->inst.addr; + con->out_kvec[1].iov_len = sizeof(msgr->inst.addr); + con->out_kvec[2].iov_base = &con->out_connect; + con->out_kvec[2].iov_len = sizeof(con->out_connect); + con->out_kvec_left = 3; + con->out_kvec_bytes = len + sizeof(msgr->inst.addr) + + sizeof(con->out_connect); + con->out_kvec_cur = con->out_kvec; + con->out_more = 0; + set_bit(WRITE_PENDING, &con->state); +} + +static void prepare_write_connect_retry(struct ceph_messenger *msgr, + struct ceph_connection *con) +{ + dout("prepare_write_connect_retry %p\n", con); + con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); + con->out_connect.global_seq = + cpu_to_le32(get_global_seq(con->msgr, 0)); + + con->out_kvec[0].iov_base = &con->out_connect; + con->out_kvec[0].iov_len = sizeof(con->out_connect); + con->out_kvec_left = 1; + con->out_kvec_bytes = sizeof(con->out_connect); + con->out_kvec_cur = con->out_kvec; + con->out_more = 0; + set_bit(WRITE_PENDING, &con->state); +} + + +/* + * write as much of pending kvecs to the socket as we can. + * 1 -> done + * 0 -> socket full, but more to do + * <0 -> error + */ +static int write_partial_kvec(struct ceph_connection *con) +{ + int ret; + + dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes); + while (con->out_kvec_bytes > 0) { + ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur, + con->out_kvec_left, con->out_kvec_bytes, + con->out_more); + if (ret <= 0) + goto out; + con->out_kvec_bytes -= ret; + if (con->out_kvec_bytes == 0) + break; /* done */ + while (ret > 0) { + if (ret >= con->out_kvec_cur->iov_len) { + ret -= con->out_kvec_cur->iov_len; + con->out_kvec_cur++; + con->out_kvec_left--; + } else { + con->out_kvec_cur->iov_len -= ret; + con->out_kvec_cur->iov_base += ret; + ret = 0; + break; + } + } + } + con->out_kvec_left = 0; + con->out_kvec_is_msg = false; + ret = 1; +out: + dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, + con->out_kvec_bytes, con->out_kvec_left, ret); + return ret; /* done! */ +} + +/* + * Write as much message data payload as we can. If we finish, queue + * up the footer. + * 1 -> done, footer is now queued in out_kvec[]. + * 0 -> socket full, but more to do + * <0 -> error + */ +static int write_partial_msg_pages(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + unsigned data_len = le32_to_cpu(msg->hdr.data_len); + size_t len; + int crc = con->msgr->nocrc; + int ret; + + dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", + con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages, + con->out_msg_pos.page_pos); + + while (con->out_msg_pos.page < con->out_msg->nr_pages) { + struct page *page = NULL; + void *kaddr = NULL; + + /* + * if we are calculating the data crc (the default), we need + * to map the page. if our pages[] has been revoked, use the + * zero page. + */ + if (msg->pages) { + page = msg->pages[con->out_msg_pos.page]; + if (crc) + kaddr = kmap(page); + } else { + page = con->msgr->zero_page; + if (crc) + kaddr = page_address(con->msgr->zero_page); + } + len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos), + (int)(data_len - con->out_msg_pos.data_pos)); + if (crc && !con->out_msg_pos.did_page_crc) { + void *base = kaddr + con->out_msg_pos.page_pos; + u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); + + BUG_ON(kaddr == NULL); + con->out_msg->footer.data_crc = + cpu_to_le32(crc32c(tmpcrc, base, len)); + con->out_msg_pos.did_page_crc = 1; + } + + ret = kernel_sendpage(con->sock, page, + con->out_msg_pos.page_pos, len, + MSG_DONTWAIT | MSG_NOSIGNAL | + MSG_MORE); + + if (crc && msg->pages) + kunmap(page); + + if (ret <= 0) + goto out; + + con->out_msg_pos.data_pos += ret; + con->out_msg_pos.page_pos += ret; + if (ret == len) { + con->out_msg_pos.page_pos = 0; + con->out_msg_pos.page++; + con->out_msg_pos.did_page_crc = 0; + } + } + + dout("write_partial_msg_pages %p msg %p done\n", con, msg); + + /* prepare and queue up footer, too */ + if (!crc) + con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; + con->out_kvec_bytes = 0; + con->out_kvec_left = 0; + con->out_kvec_cur = con->out_kvec; + prepare_write_message_footer(con, 0); + ret = 1; +out: + return ret; +} + +/* + * write some zeros + */ +static int write_partial_skip(struct ceph_connection *con) +{ + int ret; + + while (con->out_skip > 0) { + struct kvec iov = { + .iov_base = page_address(con->msgr->zero_page), + .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE) + }; + + ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1); + if (ret <= 0) + goto out; + con->out_skip -= ret; + } + ret = 1; +out: + return ret; +} + +/* + * Prepare to read connection handshake, or an ack. + */ +static void prepare_read_connect(struct ceph_connection *con) +{ + dout("prepare_read_connect %p\n", con); + con->in_base_pos = 0; +} + +static void prepare_read_ack(struct ceph_connection *con) +{ + dout("prepare_read_ack %p\n", con); + con->in_base_pos = 0; +} + +static void prepare_read_tag(struct ceph_connection *con) +{ + dout("prepare_read_tag %p\n", con); + con->in_base_pos = 0; + con->in_tag = CEPH_MSGR_TAG_READY; +} + +/* + * Prepare to read a message. + */ +static int prepare_read_message(struct ceph_connection *con) +{ + dout("prepare_read_message %p\n", con); + BUG_ON(con->in_msg != NULL); + con->in_base_pos = 0; + con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0; + return 0; +} + + +static int read_partial(struct ceph_connection *con, + int *to, int size, void *object) +{ + *to += size; + while (con->in_base_pos < *to) { + int left = *to - con->in_base_pos; + int have = size - left; + int ret = ceph_tcp_recvmsg(con->sock, object + have, left); + if (ret <= 0) + return ret; + con->in_base_pos += ret; + } + return 1; +} + + +/* + * Read all or part of the connect-side handshake on a new connection + */ +static int read_partial_connect(struct ceph_connection *con) +{ + int ret, to = 0; + + dout("read_partial_connect %p at %d\n", con, con->in_base_pos); + + /* peer's banner */ + ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner); + if (ret <= 0) + goto out; + ret = read_partial(con, &to, sizeof(con->actual_peer_addr), + &con->actual_peer_addr); + if (ret <= 0) + goto out; + ret = read_partial(con, &to, sizeof(con->peer_addr_for_me), + &con->peer_addr_for_me); + if (ret <= 0) + goto out; + ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply); + if (ret <= 0) + goto out; + + dout("read_partial_connect %p connect_seq = %u, global_seq = %u\n", + con, le32_to_cpu(con->in_reply.connect_seq), + le32_to_cpu(con->in_reply.global_seq)); +out: + return ret; +} + +/* + * Verify the hello banner looks okay. + */ +static int verify_hello(struct ceph_connection *con) +{ + if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { + pr_err("connect to/from %s has bad banner\n", + pr_addr(&con->peer_addr.in_addr)); + con->error_msg = "protocol error, bad banner"; + return -1; + } + return 0; +} + +static bool addr_is_blank(struct sockaddr_storage *ss) +{ + switch (ss->ss_family) { + case AF_INET: + return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0; + case AF_INET6: + return + ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 && + ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 && + ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 && + ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0; + } + return false; +} + +static int addr_port(struct sockaddr_storage *ss) +{ + switch (ss->ss_family) { + case AF_INET: + return ((struct sockaddr_in *)ss)->sin_port; + case AF_INET6: + return ((struct sockaddr_in6 *)ss)->sin6_port; + } + return 0; +} + +static void addr_set_port(struct sockaddr_storage *ss, int p) +{ + switch (ss->ss_family) { + case AF_INET: + ((struct sockaddr_in *)ss)->sin_port = htons(p); + case AF_INET6: + ((struct sockaddr_in6 *)ss)->sin6_port = htons(p); + } +} + +/* + * Parse an ip[:port] list into an addr array. Use the default + * monitor port if a port isn't specified. + */ +int ceph_parse_ips(const char *c, const char *end, + struct ceph_entity_addr *addr, + int max_count, int *count) +{ + int i; + const char *p = c; + + dout("parse_ips on '%.*s'\n", (int)(end-c), c); + for (i = 0; i < max_count; i++) { + const char *ipend; + struct sockaddr_storage *ss = &addr[i].in_addr; + struct sockaddr_in *in4 = (void *)ss; + struct sockaddr_in6 *in6 = (void *)ss; + int port; + + memset(ss, 0, sizeof(*ss)); + if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr, + ',', &ipend)) { + ss->ss_family = AF_INET; + } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr, + ',', &ipend)) { + ss->ss_family = AF_INET6; + } else { + goto bad; + } + p = ipend; + + /* port? */ + if (p < end && *p == ':') { + port = 0; + p++; + while (p < end && *p >= '0' && *p <= '9') { + port = (port * 10) + (*p - '0'); + p++; + } + if (port > 65535 || port == 0) + goto bad; + } else { + port = CEPH_MON_PORT; + } + + addr_set_port(ss, port); + + dout("parse_ips got %s\n", pr_addr(ss)); + + if (p == end) + break; + if (*p != ',') + goto bad; + p++; + } + + if (p != end) + goto bad; + + if (count) + *count = i + 1; + return 0; + +bad: + pr_err("parse_ips bad ip '%s'\n", c); + return -EINVAL; +} + +static int process_connect(struct ceph_connection *con) +{ + dout("process_connect on %p tag %d\n", con, (int)con->in_tag); + + if (verify_hello(con) < 0) + return -1; + + /* + * Make sure the other end is who we wanted. note that the other + * end may not yet know their ip address, so if it's 0.0.0.0, give + * them the benefit of the doubt. + */ + if (!ceph_entity_addr_is_local(&con->peer_addr, + &con->actual_peer_addr) && + !(addr_is_blank(&con->actual_peer_addr.in_addr) && + con->actual_peer_addr.nonce == con->peer_addr.nonce)) { + pr_err("wrong peer, want %s/%d, " + "got %s/%d, wtf\n", + pr_addr(&con->peer_addr.in_addr), + con->peer_addr.nonce, + pr_addr(&con->actual_peer_addr.in_addr), + con->actual_peer_addr.nonce); + con->error_msg = "protocol error, wrong peer"; + return -1; + } + + /* + * did we learn our address? + */ + if (addr_is_blank(&con->msgr->inst.addr.in_addr)) { + int port = addr_port(&con->msgr->inst.addr.in_addr); + + memcpy(&con->msgr->inst.addr.in_addr, + &con->peer_addr_for_me.in_addr, + sizeof(con->peer_addr_for_me.in_addr)); + addr_set_port(&con->msgr->inst.addr.in_addr, port); + dout("process_connect learned my addr is %s\n", + pr_addr(&con->msgr->inst.addr.in_addr)); + } + + switch (con->in_reply.tag) { + case CEPH_MSGR_TAG_BADPROTOVER: + dout("process_connect got BADPROTOVER my %d != their %d\n", + le32_to_cpu(con->out_connect.protocol_version), + le32_to_cpu(con->in_reply.protocol_version)); + pr_err("%s%lld %s protocol version mismatch," + " my %d != server's %d\n", + ENTITY_NAME(con->peer_name), + pr_addr(&con->peer_addr.in_addr), + le32_to_cpu(con->out_connect.protocol_version), + le32_to_cpu(con->in_reply.protocol_version)); + con->error_msg = "protocol version mismatch"; + if (con->ops->bad_proto) + con->ops->bad_proto(con); + reset_connection(con); + set_bit(CLOSED, &con->state); /* in case there's queued work */ + return -1; + + + case CEPH_MSGR_TAG_RESETSESSION: + /* + * If we connected with a large connect_seq but the peer + * has no record of a session with us (no connection, or + * connect_seq == 0), they will send RESETSESION to indicate + * that they must have reset their session, and may have + * dropped messages. + */ + dout("process_connect got RESET peer seq %u\n", + le32_to_cpu(con->in_connect.connect_seq)); + pr_err("%s%lld %s connection reset\n", + ENTITY_NAME(con->peer_name), + pr_addr(&con->peer_addr.in_addr)); + reset_connection(con); + prepare_write_connect_retry(con->msgr, con); + prepare_read_connect(con); + + /* Tell ceph about it. */ + pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name)); + if (con->ops->peer_reset) + con->ops->peer_reset(con); + break; + + case CEPH_MSGR_TAG_RETRY_SESSION: + /* + * If we sent a smaller connect_seq than the peer has, try + * again with a larger value. + */ + dout("process_connect got RETRY my seq = %u, peer_seq = %u\n", + le32_to_cpu(con->out_connect.connect_seq), + le32_to_cpu(con->in_connect.connect_seq)); + con->connect_seq = le32_to_cpu(con->in_connect.connect_seq); + prepare_write_connect_retry(con->msgr, con); + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_RETRY_GLOBAL: + /* + * If we sent a smaller global_seq than the peer has, try + * again with a larger value. + */ + dout("process_connect got RETRY_GLOBAL my %u, peer_gseq = %u\n", + con->peer_global_seq, + le32_to_cpu(con->in_connect.global_seq)); + get_global_seq(con->msgr, + le32_to_cpu(con->in_connect.global_seq)); + prepare_write_connect_retry(con->msgr, con); + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_READY: + clear_bit(CONNECTING, &con->state); + if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) + set_bit(LOSSYRX, &con->state); + con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); + con->connect_seq++; + dout("process_connect got READY gseq %d cseq %d (%d)\n", + con->peer_global_seq, + le32_to_cpu(con->in_reply.connect_seq), + con->connect_seq); + WARN_ON(con->connect_seq != + le32_to_cpu(con->in_reply.connect_seq)); + + con->delay = 0; /* reset backoff memory */ + prepare_read_tag(con); + break; + + case CEPH_MSGR_TAG_WAIT: + /* + * If there is a connection race (we are opening + * connections to each other), one of us may just have + * to WAIT. This shouldn't happen if we are the + * client. + */ + pr_err("process_connect peer connecting WAIT\n"); + + default: + pr_err("connect protocol error, will retry\n"); + con->error_msg = "protocol error, garbage tag during connect"; + return -1; + } + return 0; +} + + +/* + * read (part of) an ack + */ +static int read_partial_ack(struct ceph_connection *con) +{ + int to = 0; + + return read_partial(con, &to, sizeof(con->in_temp_ack), + &con->in_temp_ack); +} + + +/* + * We can finally discard anything that's been acked. + */ +static void process_ack(struct ceph_connection *con) +{ + struct ceph_msg *m; + u64 ack = le64_to_cpu(con->in_temp_ack); + u64 seq; + + mutex_lock(&con->out_mutex); + while (!list_empty(&con->out_sent)) { + m = list_first_entry(&con->out_sent, struct ceph_msg, + list_head); + seq = le64_to_cpu(m->hdr.seq); + if (seq > ack) + break; + dout("got ack for seq %llu type %d at %p\n", seq, + le16_to_cpu(m->hdr.type), m); + ceph_msg_remove(m); + } + mutex_unlock(&con->out_mutex); + prepare_read_tag(con); +} + + + + + + +/* + * read (part of) a message. + */ +static int read_partial_message(struct ceph_connection *con) +{ + struct ceph_msg *m = con->in_msg; + void *p; + int ret; + int to, want, left; + unsigned front_len, middle_len, data_len, data_off; + int datacrc = con->msgr->nocrc; + + dout("read_partial_message con %p msg %p\n", con, m); + + /* header */ + while (con->in_base_pos < sizeof(con->in_hdr)) { + left = sizeof(con->in_hdr) - con->in_base_pos; + ret = ceph_tcp_recvmsg(con->sock, + (char *)&con->in_hdr + con->in_base_pos, + left); + if (ret <= 0) + return ret; + con->in_base_pos += ret; + if (con->in_base_pos == sizeof(con->in_hdr)) { + u32 crc = crc32c(0, (void *)&con->in_hdr, + sizeof(con->in_hdr) - sizeof(con->in_hdr.crc)); + if (crc != le32_to_cpu(con->in_hdr.crc)) { + pr_err("read_partial_message bad hdr " + " crc %u != expected %u\n", + crc, con->in_hdr.crc); + return -EBADMSG; + } + } + } + + front_len = le32_to_cpu(con->in_hdr.front_len); + if (front_len > CEPH_MSG_MAX_FRONT_LEN) + return -EIO; + middle_len = le32_to_cpu(con->in_hdr.middle_len); + if (middle_len > CEPH_MSG_MAX_DATA_LEN) + return -EIO; + data_len = le32_to_cpu(con->in_hdr.data_len); + if (data_len > CEPH_MSG_MAX_DATA_LEN) + return -EIO; + + /* allocate message? */ + if (!con->in_msg) { + dout("got hdr type %d front %d data %d\n", con->in_hdr.type, + con->in_hdr.front_len, con->in_hdr.data_len); + con->in_msg = con->ops->alloc_msg(con, &con->in_hdr); + if (!con->in_msg) { + /* skip this message */ + dout("alloc_msg returned NULL, skipping message\n"); + con->in_base_pos = -front_len - middle_len - data_len - + sizeof(m->footer); + con->in_tag = CEPH_MSGR_TAG_READY; + return 0; + } + if (IS_ERR(con->in_msg)) { + ret = PTR_ERR(con->in_msg); + con->in_msg = NULL; + con->error_msg = "out of memory for incoming message"; + return ret; + } + m = con->in_msg; + m->front.iov_len = 0; /* haven't read it yet */ + memcpy(&m->hdr, &con->in_hdr, sizeof(con->in_hdr)); + } + + /* front */ + while (m->front.iov_len < front_len) { + BUG_ON(m->front.iov_base == NULL); + left = front_len - m->front.iov_len; + ret = ceph_tcp_recvmsg(con->sock, (char *)m->front.iov_base + + m->front.iov_len, left); + if (ret <= 0) + return ret; + m->front.iov_len += ret; + if (m->front.iov_len == front_len) + con->in_front_crc = crc32c(0, m->front.iov_base, + m->front.iov_len); + } + + /* middle */ + while (middle_len > 0 && (!m->middle || + m->middle->vec.iov_len < middle_len)) { + if (m->middle == NULL) { + ret = -EOPNOTSUPP; + if (con->ops->alloc_middle) + ret = con->ops->alloc_middle(con, m); + if (ret < 0) { + dout("alloc_middle failed, skipping payload\n"); + con->in_base_pos = -middle_len - data_len + - sizeof(m->footer); + ceph_msg_put(con->in_msg); + con->in_msg = NULL; + con->in_tag = CEPH_MSGR_TAG_READY; + return 0; + } + m->middle->vec.iov_len = 0; + } + left = middle_len - m->middle->vec.iov_len; + ret = ceph_tcp_recvmsg(con->sock, + (char *)m->middle->vec.iov_base + + m->middle->vec.iov_len, left); + if (ret <= 0) + return ret; + m->middle->vec.iov_len += ret; + if (m->middle->vec.iov_len == middle_len) + con->in_middle_crc = crc32c(0, m->middle->vec.iov_base, + m->middle->vec.iov_len); + } + + /* (page) data */ + data_off = le16_to_cpu(m->hdr.data_off); + if (data_len == 0) + goto no_data; + + if (m->nr_pages == 0) { + con->in_msg_pos.page = 0; + con->in_msg_pos.page_pos = data_off & ~PAGE_MASK; + con->in_msg_pos.data_pos = 0; + /* find pages for data payload */ + want = calc_pages_for(data_off & ~PAGE_MASK, data_len); + ret = -1; + if (con->ops->prepare_pages) + ret = con->ops->prepare_pages(con, m, want); + if (ret < 0) { + dout("%p prepare_pages failed, skipping payload\n", m); + con->in_base_pos = -data_len - sizeof(m->footer); + ceph_msg_put(con->in_msg); + con->in_msg = NULL; + con->in_tag = CEPH_MSGR_TAG_READY; + return 0; + } + BUG_ON(m->nr_pages < want); + } + while (con->in_msg_pos.data_pos < data_len) { + left = min((int)(data_len - con->in_msg_pos.data_pos), + (int)(PAGE_SIZE - con->in_msg_pos.page_pos)); + BUG_ON(m->pages == NULL); + p = kmap(m->pages[con->in_msg_pos.page]); + ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, + left); + if (ret > 0 && datacrc) + con->in_data_crc = + crc32c(con->in_data_crc, + p + con->in_msg_pos.page_pos, ret); + kunmap(m->pages[con->in_msg_pos.page]); + if (ret <= 0) + return ret; + con->in_msg_pos.data_pos += ret; + con->in_msg_pos.page_pos += ret; + if (con->in_msg_pos.page_pos == PAGE_SIZE) { + con->in_msg_pos.page_pos = 0; + con->in_msg_pos.page++; + } + } + +no_data: + /* footer */ + to = sizeof(m->hdr) + sizeof(m->footer); + while (con->in_base_pos < to) { + left = to - con->in_base_pos; + ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer + + (con->in_base_pos - sizeof(m->hdr)), + left); + if (ret <= 0) + return ret; + con->in_base_pos += ret; + } + dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", + m, front_len, m->footer.front_crc, middle_len, + m->footer.middle_crc, data_len, m->footer.data_crc); + + /* crc ok? */ + if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) { + pr_err("read_partial_message %p front crc %u != exp. %u\n", + m, con->in_front_crc, m->footer.front_crc); + return -EBADMSG; + } + if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) { + pr_err("read_partial_message %p middle crc %u != exp %u\n", + m, con->in_middle_crc, m->footer.middle_crc); + return -EBADMSG; + } + if (datacrc && + (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && + con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { + pr_err("read_partial_message %p data crc %u != exp. %u\n", m, + con->in_data_crc, le32_to_cpu(m->footer.data_crc)); + return -EBADMSG; + } + + return 1; /* done! */ +} + +/* + * Process message. This happens in the worker thread. The callback should + * be careful not to do anything that waits on other incoming messages or it + * may deadlock. + */ +static void process_message(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->in_msg; + + con->in_msg = NULL; + + /* if first message, set peer_name */ + if (con->peer_name.type == 0) + con->peer_name = msg->hdr.src.name; + + mutex_lock(&con->out_mutex); + con->in_seq++; + mutex_unlock(&con->out_mutex); + + dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", + msg, le64_to_cpu(msg->hdr.seq), + ENTITY_NAME(msg->hdr.src.name), + le16_to_cpu(msg->hdr.type), + ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), + le32_to_cpu(msg->hdr.front_len), + le32_to_cpu(msg->hdr.data_len), + con->in_front_crc, con->in_middle_crc, con->in_data_crc); + con->ops->dispatch(con, msg); + prepare_read_tag(con); +} + + +/* + * Write something to the socket. Called in a worker thread when the + * socket appears to be writeable and we have something ready to send. + */ +static int try_write(struct ceph_connection *con) +{ + struct ceph_messenger *msgr = con->msgr; + int ret = 1; + + dout("try_write start %p state %lu nref %d\n", con, con->state, + atomic_read(&con->nref)); + + mutex_lock(&con->out_mutex); +more: + dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); + + /* open the socket first? */ + if (con->sock == NULL) { + /* + * if we were STANDBY and are reconnecting _this_ + * connection, bump connect_seq now. Always bump + * global_seq. + */ + if (test_and_clear_bit(STANDBY, &con->state)) + con->connect_seq++; + + prepare_write_connect(msgr, con); + prepare_read_connect(con); + set_bit(CONNECTING, &con->state); + + con->in_tag = CEPH_MSGR_TAG_READY; + dout("try_write initiating connect on %p new state %lu\n", + con, con->state); + con->sock = ceph_tcp_connect(con); + if (IS_ERR(con->sock)) { + con->sock = NULL; + con->error_msg = "connect error"; + ret = -1; + goto out; + } + } + +more_kvec: + /* kvec data queued? */ + if (con->out_skip) { + ret = write_partial_skip(con); + if (ret <= 0) + goto done; + if (ret < 0) { + dout("try_write write_partial_skip err %d\n", ret); + goto done; + } + } + if (con->out_kvec_left) { + ret = write_partial_kvec(con); + if (ret <= 0) + goto done; + if (ret < 0) { + dout("try_write write_partial_kvec err %d\n", ret); + goto done; + } + } + + /* msg pages? */ + if (con->out_msg) { + ret = write_partial_msg_pages(con); + if (ret == 1) + goto more_kvec; /* we need to send the footer, too! */ + if (ret == 0) + goto done; + if (ret < 0) { + dout("try_write write_partial_msg_pages err %d\n", + ret); + goto done; + } + } + + if (!test_bit(CONNECTING, &con->state)) { + /* is anything else pending? */ + if (!list_empty(&con->out_queue)) { + prepare_write_message(con); + goto more; + } + if (con->in_seq > con->in_seq_acked) { + prepare_write_ack(con); + goto more; + } + if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) { + prepare_write_keepalive(con); + goto more; + } + } + + /* Nothing to do! */ + clear_bit(WRITE_PENDING, &con->state); + dout("try_write nothing else to write.\n"); +done: + ret = 0; +out: + mutex_unlock(&con->out_mutex); + dout("try_write done on %p\n", con); + return ret; +} + + + +/* + * Read what we can from the socket. + */ +static int try_read(struct ceph_connection *con) +{ + struct ceph_messenger *msgr; + int ret = -1; + + if (!con->sock) + return 0; + + if (test_bit(STANDBY, &con->state)) + return 0; + + dout("try_read start on %p\n", con); + msgr = con->msgr; + +more: + dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, + con->in_base_pos); + if (test_bit(CONNECTING, &con->state)) { + dout("try_read connecting\n"); + ret = read_partial_connect(con); + if (ret <= 0) + goto done; + if (process_connect(con) < 0) { + ret = -1; + goto out; + } + goto more; + } + + if (con->in_base_pos < 0) { + /* + * skipping + discarding content. + * + * FIXME: there must be a better way to do this! + */ + static char buf[1024]; + int skip = min(1024, -con->in_base_pos); + dout("skipping %d / %d bytes\n", skip, -con->in_base_pos); + ret = ceph_tcp_recvmsg(con->sock, buf, skip); + if (ret <= 0) + goto done; + con->in_base_pos += ret; + if (con->in_base_pos) + goto more; + } + if (con->in_tag == CEPH_MSGR_TAG_READY) { + /* + * what's next? + */ + ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1); + if (ret <= 0) + goto done; + dout("try_read got tag %d\n", (int)con->in_tag); + switch (con->in_tag) { + case CEPH_MSGR_TAG_MSG: + prepare_read_message(con); + break; + case CEPH_MSGR_TAG_ACK: + prepare_read_ack(con); + break; + case CEPH_MSGR_TAG_CLOSE: + set_bit(CLOSED, &con->state); /* fixme */ + goto done; + default: + goto bad_tag; + } + } + if (con->in_tag == CEPH_MSGR_TAG_MSG) { + ret = read_partial_message(con); + if (ret <= 0) { + switch (ret) { + case -EBADMSG: + con->error_msg = "bad crc"; + ret = -EIO; + goto out; + case -EIO: + con->error_msg = "io error"; + goto out; + default: + goto done; + } + } + if (con->in_tag == CEPH_MSGR_TAG_READY) + goto more; + process_message(con); + goto more; + } + if (con->in_tag == CEPH_MSGR_TAG_ACK) { + ret = read_partial_ack(con); + if (ret <= 0) + goto done; + process_ack(con); + goto more; + } + +done: + ret = 0; +out: + dout("try_read done on %p\n", con); + return ret; + +bad_tag: + pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag); + con->error_msg = "protocol error, garbage tag"; + ret = -1; + goto out; +} + + +/* + * Atomically queue work on a connection. Bump @con reference to + * avoid races with connection teardown. + * + * There is some trickery going on with QUEUED and BUSY because we + * only want a _single_ thread operating on each connection at any + * point in time, but we want to use all available CPUs. + * + * The worker thread only proceeds if it can atomically set BUSY. It + * clears QUEUED and does it's thing. When it thinks it's done, it + * clears BUSY, then rechecks QUEUED.. if it's set again, it loops + * (tries again to set BUSY). + * + * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we + * try to queue work. If that fails (work is already queued, or BUSY) + * we give up (work also already being done or is queued) but leave QUEUED + * set so that the worker thread will loop if necessary. + */ +static void queue_con(struct ceph_connection *con) +{ + if (test_bit(DEAD, &con->state)) { + dout("queue_con %p ignoring: DEAD\n", + con); + return; + } + + if (!con->ops->get(con)) { + dout("queue_con %p ref count 0\n", con); + return; + } + + set_bit(QUEUED, &con->state); + if (test_bit(BUSY, &con->state)) { + dout("queue_con %p - already BUSY\n", con); + con->ops->put(con); + } else if (!queue_work(ceph_msgr_wq, &con->work.work)) { + dout("queue_con %p - already queued\n", con); + con->ops->put(con); + } else { + dout("queue_con %p\n", con); + } +} + +/* + * Do some work on a connection. Drop a connection ref when we're done. + */ +static void con_work(struct work_struct *work) +{ + struct ceph_connection *con = container_of(work, struct ceph_connection, + work.work); + int backoff = 0; + +more: + if (test_and_set_bit(BUSY, &con->state) != 0) { + dout("con_work %p BUSY already set\n", con); + goto out; + } + dout("con_work %p start, clearing QUEUED\n", con); + clear_bit(QUEUED, &con->state); + + if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ + dout("con_work CLOSED\n"); + con_close_socket(con); + goto done; + } + if (test_and_clear_bit(OPENING, &con->state)) { + /* reopen w/ new peer */ + dout("con_work OPENING\n"); + con_close_socket(con); + } + + if (test_and_clear_bit(SOCK_CLOSED, &con->state) || + try_read(con) < 0 || + try_write(con) < 0) { + backoff = 1; + ceph_fault(con); /* error/fault path */ + } + +done: + clear_bit(BUSY, &con->state); + dout("con->state=%lu\n", con->state); + if (test_bit(QUEUED, &con->state)) { + if (!backoff) { + dout("con_work %p QUEUED reset, looping\n", con); + goto more; + } + dout("con_work %p QUEUED reset, but just faulted\n", con); + clear_bit(QUEUED, &con->state); + } + dout("con_work %p done\n", con); + +out: + con->ops->put(con); +} + + +/* + * Generic error/fault handler. A retry mechanism is used with + * exponential backoff + */ +static void ceph_fault(struct ceph_connection *con) +{ + pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), + pr_addr(&con->peer_addr.in_addr), con->error_msg); + dout("fault %p state %lu to peer %s\n", + con, con->state, pr_addr(&con->peer_addr.in_addr)); + + if (test_bit(LOSSYTX, &con->state)) { + dout("fault on LOSSYTX channel\n"); + goto out; + } + + clear_bit(BUSY, &con->state); /* to avoid an improbable race */ + + con_close_socket(con); + con->in_msg = NULL; + + /* If there are no messages in the queue, place the connection + * in a STANDBY state (i.e., don't try to reconnect just yet). */ + mutex_lock(&con->out_mutex); + if (list_empty(&con->out_queue) && !con->out_keepalive_pending) { + dout("fault setting STANDBY\n"); + set_bit(STANDBY, &con->state); + mutex_unlock(&con->out_mutex); + goto out; + } + + /* Requeue anything that hasn't been acked, and retry after a + * delay. */ + list_splice_init(&con->out_sent, &con->out_queue); + mutex_unlock(&con->out_mutex); + + if (con->delay == 0) + con->delay = BASE_DELAY_INTERVAL; + else if (con->delay < MAX_DELAY_INTERVAL) + con->delay *= 2; + + /* explicitly schedule work to try to reconnect again later. */ + dout("fault queueing %p delay %lu\n", con, con->delay); + con->ops->get(con); + if (queue_delayed_work(ceph_msgr_wq, &con->work, + round_jiffies_relative(con->delay)) == 0) + con->ops->put(con); + +out: + if (con->ops->fault) + con->ops->fault(con); +} + + + +/* + * create a new messenger instance + */ +struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr) +{ + struct ceph_messenger *msgr; + + msgr = kzalloc(sizeof(*msgr), GFP_KERNEL); + if (msgr == NULL) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&msgr->global_seq_lock); + + /* the zero page is needed if a request is "canceled" while the message + * is being written over the socket */ + msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!msgr->zero_page) { + kfree(msgr); + return ERR_PTR(-ENOMEM); + } + kmap(msgr->zero_page); + + if (myaddr) + msgr->inst.addr = *myaddr; + + /* select a random nonce */ + get_random_bytes(&msgr->inst.addr.nonce, + sizeof(msgr->inst.addr.nonce)); + + dout("messenger_create %p\n", msgr); + return msgr; +} + +void ceph_messenger_destroy(struct ceph_messenger *msgr) +{ + dout("destroy %p\n", msgr); + kunmap(msgr->zero_page); + __free_page(msgr->zero_page); + kfree(msgr); + dout("destroyed messenger %p\n", msgr); +} + +/* + * Queue up an outgoing message on the given connection. + */ +void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) +{ + if (test_bit(CLOSED, &con->state)) { + dout("con_send %p closed, dropping %p\n", con, msg); + ceph_msg_put(msg); + return; + } + + /* set src+dst */ + msg->hdr.src = con->msgr->inst; + msg->hdr.orig_src = con->msgr->inst; + msg->hdr.dst_erank = con->peer_addr.erank; + + /* queue */ + mutex_lock(&con->out_mutex); + BUG_ON(!list_empty(&msg->list_head)); + list_add_tail(&msg->list_head, &con->out_queue); + dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg, + ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type), + ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), + le32_to_cpu(msg->hdr.front_len), + le32_to_cpu(msg->hdr.middle_len), + le32_to_cpu(msg->hdr.data_len)); + mutex_unlock(&con->out_mutex); + + /* if there wasn't anything waiting to send before, queue + * new work */ + if (test_and_set_bit(WRITE_PENDING, &con->state) == 0) + queue_con(con); +} + +/* + * Revoke a message that was previously queued for send + */ +void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) +{ + mutex_lock(&con->out_mutex); + if (!list_empty(&msg->list_head)) { + dout("con_revoke %p msg %p\n", con, msg); + list_del_init(&msg->list_head); + ceph_msg_put(msg); + msg->hdr.seq = 0; + if (con->out_msg == msg) + con->out_msg = NULL; + if (con->out_kvec_is_msg) { + con->out_skip = con->out_kvec_bytes; + con->out_kvec_is_msg = false; + } + } else { + dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg); + } + mutex_unlock(&con->out_mutex); +} + +/* + * Queue a keepalive byte to ensure the tcp connection is alive. + */ +void ceph_con_keepalive(struct ceph_connection *con) +{ + if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 && + test_and_set_bit(WRITE_PENDING, &con->state) == 0) + queue_con(con); +} + + +/* + * construct a new message with given type, size + * the new msg has a ref count of 1. + */ +struct ceph_msg *ceph_msg_new(int type, int front_len, + int page_len, int page_off, struct page **pages) +{ + struct ceph_msg *m; + + m = kmalloc(sizeof(*m), GFP_NOFS); + if (m == NULL) + goto out; + atomic_set(&m->nref, 1); + INIT_LIST_HEAD(&m->list_head); + + m->hdr.type = cpu_to_le16(type); + m->hdr.front_len = cpu_to_le32(front_len); + m->hdr.middle_len = 0; + m->hdr.data_len = cpu_to_le32(page_len); + m->hdr.data_off = cpu_to_le16(page_off); + m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); + m->footer.front_crc = 0; + m->footer.middle_crc = 0; + m->footer.data_crc = 0; + m->front_max = front_len; + m->front_is_vmalloc = false; + m->more_to_follow = false; + m->pool = NULL; + + /* front */ + if (front_len) { + if (front_len > PAGE_CACHE_SIZE) { + m->front.iov_base = __vmalloc(front_len, GFP_NOFS, + PAGE_KERNEL); + m->front_is_vmalloc = true; + } else { + m->front.iov_base = kmalloc(front_len, GFP_NOFS); + } + if (m->front.iov_base == NULL) { + pr_err("msg_new can't allocate %d bytes\n", + front_len); + goto out2; + } + } else { + m->front.iov_base = NULL; + } + m->front.iov_len = front_len; + + /* middle */ + m->middle = NULL; + + /* data */ + m->nr_pages = calc_pages_for(page_off, page_len); + m->pages = pages; + + dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len, + m->nr_pages); + return m; + +out2: + ceph_msg_put(m); +out: + pr_err("msg_new can't create type %d len %d\n", type, front_len); + return ERR_PTR(-ENOMEM); +} + +/* + * Generic message allocator, for incoming messages. + */ +struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr) +{ + int type = le16_to_cpu(hdr->type); + int front_len = le32_to_cpu(hdr->front_len); + struct ceph_msg *msg = ceph_msg_new(type, front_len, 0, 0, NULL); + + if (!msg) { + pr_err("unable to allocate msg type %d len %d\n", + type, front_len); + return ERR_PTR(-ENOMEM); + } + return msg; +} + +/* + * Allocate "middle" portion of a message, if it is needed and wasn't + * allocated by alloc_msg. This allows us to read a small fixed-size + * per-type header in the front and then gracefully fail (i.e., + * propagate the error to the caller based on info in the front) when + * the middle is too large. + */ +int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) +{ + int type = le16_to_cpu(msg->hdr.type); + int middle_len = le32_to_cpu(msg->hdr.middle_len); + + dout("alloc_middle %p type %d %s middle_len %d\n", msg, type, + ceph_msg_type_name(type), middle_len); + BUG_ON(!middle_len); + BUG_ON(msg->middle); + + msg->middle = ceph_buffer_new_alloc(middle_len, GFP_NOFS); + if (!msg->middle) + return -ENOMEM; + return 0; +} + + +/* + * Free a generically kmalloc'd message. + */ +void ceph_msg_kfree(struct ceph_msg *m) +{ + dout("msg_kfree %p\n", m); + if (m->front_is_vmalloc) + vfree(m->front.iov_base); + else + kfree(m->front.iov_base); + kfree(m); +} + +/* + * Drop a msg ref. Destroy as needed. + */ +void ceph_msg_put(struct ceph_msg *m) +{ + dout("ceph_msg_put %p %d -> %d\n", m, atomic_read(&m->nref), + atomic_read(&m->nref)-1); + if (atomic_read(&m->nref) <= 0) { + pr_err("bad ceph_msg_put on %p %llu %d=%s %d+%d\n", + m, le64_to_cpu(m->hdr.seq), + le16_to_cpu(m->hdr.type), + ceph_msg_type_name(le16_to_cpu(m->hdr.type)), + le32_to_cpu(m->hdr.front_len), + le32_to_cpu(m->hdr.data_len)); + WARN_ON(1); + } + if (atomic_dec_and_test(&m->nref)) { + dout("ceph_msg_put last one on %p\n", m); + WARN_ON(!list_empty(&m->list_head)); + + /* drop middle, data, if any */ + if (m->middle) { + ceph_buffer_put(m->middle); + m->middle = NULL; + } + m->nr_pages = 0; + m->pages = NULL; + + if (m->pool) + ceph_msgpool_put(m->pool, m); + else + ceph_msg_kfree(m); + } +} -- cgit v1.2.3 From 13e38c8ae771d73bf6d1f0f98e35f99c0f0d48ff Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 9 Oct 2009 16:36:34 -0700 Subject: ceph: update to mon client protocol v15 The mon request headers now include session_mon information that must be properly initialized. Signed-off-by: Sage Weil --- fs/ceph/ceph_fs.h | 19 ++++++++++++------- fs/ceph/messenger.c | 2 +- fs/ceph/mon_client.c | 8 ++++++-- 3 files changed, 19 insertions(+), 10 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h index 21ed51b127f..acf24c6944c 100644 --- a/fs/ceph/ceph_fs.h +++ b/fs/ceph/ceph_fs.h @@ -37,10 +37,10 @@ */ #define CEPH_OSD_PROTOCOL 7 /* cluster internal */ #define CEPH_MDS_PROTOCOL 9 /* cluster internal */ -#define CEPH_MON_PROTOCOL 4 /* cluster internal */ +#define CEPH_MON_PROTOCOL 5 /* cluster internal */ #define CEPH_OSDC_PROTOCOL 20 /* server/client */ #define CEPH_MDSC_PROTOCOL 29 /* server/client */ -#define CEPH_MONC_PROTOCOL 14 /* server/client */ +#define CEPH_MONC_PROTOCOL 15 /* server/client */ #define CEPH_INO_ROOT 1 @@ -118,9 +118,14 @@ struct ceph_file_layout { #define CEPH_MSG_OSD_OP 42 #define CEPH_MSG_OSD_OPREPLY 43 +struct ceph_mon_request_header { + __le64 have_version; + __le16 session_mon; + __le64 session_mon_tid; +} __attribute__ ((packed)); struct ceph_mon_statfs { - __le64 have_version; + struct ceph_mon_request_header monhdr; struct ceph_fsid fsid; __le64 tid; } __attribute__ ((packed)); @@ -138,22 +143,22 @@ struct ceph_mon_statfs_reply { } __attribute__ ((packed)); struct ceph_osd_getmap { - __le64 have_version; + struct ceph_mon_request_header monhdr; struct ceph_fsid fsid; __le32 start; } __attribute__ ((packed)); struct ceph_mds_getmap { - __le64 have_version; + struct ceph_mon_request_header monhdr; struct ceph_fsid fsid; } __attribute__ ((packed)); struct ceph_client_mount { - __le64 have_version; + struct ceph_mon_request_header monhdr; } __attribute__ ((packed)); struct ceph_mon_subscribe_item { - __le64 have; + __le64 have_version; __le64 have; __u8 onetime; } __attribute__ ((packed)); diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 63f7f135938..b48abc0b3be 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -857,7 +857,7 @@ out: static int verify_hello(struct ceph_connection *con) { if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { - pr_err("connect to/from %s has bad banner\n", + pr_err("connect to %s got bad banner\n", pr_addr(&con->peer_addr.in_addr)); con->error_msg = "protocol error, bad banner"; return -1; diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c index 9c34df17fa4..dc698caccc4 100644 --- a/fs/ceph/mon_client.c +++ b/fs/ceph/mon_client.c @@ -273,7 +273,9 @@ static void __request_mount(struct ceph_mon_client *monc) if (IS_ERR(msg)) return; h = msg->front.iov_base; - h->have_version = 0; + h->monhdr.have_version = 0; + h->monhdr.session_mon = cpu_to_le16(-1); + h->monhdr.session_mon_tid = 0; ceph_con_send(monc->con, msg); } @@ -422,7 +424,9 @@ static int send_statfs(struct ceph_mon_client *monc, return PTR_ERR(msg); req->request = msg; h = msg->front.iov_base; - h->have_version = 0; + h->monhdr.have_version = 0; + h->monhdr.session_mon = cpu_to_le16(-1); + h->monhdr.session_mon_tid = 0; h->fsid = monc->monmap->fsid; h->tid = cpu_to_le64(req->tid); ceph_con_send(monc->con, msg); -- cgit v1.2.3 From 63f2d211954b790fea0a9caeae605c7956535af6 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 3 Nov 2009 15:17:56 -0800 Subject: ceph: use fixed endian encoding for ceph_entity_addr We exchange struct ceph_entity_addr over the wire and store it on disk. The sockaddr_storage.ss_family field, however, is host endianness. So, fix ss_family endianness to big endian when sending/receiving over the wire. Signed-off-by: Sage Weil --- fs/ceph/decode.h | 16 ++++++++++++++-- fs/ceph/mdsmap.c | 1 + fs/ceph/messenger.c | 23 ++++++++++++++++++----- fs/ceph/messenger.h | 1 + fs/ceph/mon_client.c | 2 ++ fs/ceph/msgr.h | 2 +- fs/ceph/osdmap.c | 3 +++ 7 files changed, 40 insertions(+), 8 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h index 91179fb2cc3..a382aecc55b 100644 --- a/fs/ceph/decode.h +++ b/fs/ceph/decode.h @@ -76,18 +76,30 @@ static inline void ceph_decode_copy(void **p, void *pv, size_t n) * struct ceph_timespec <-> struct timespec */ static inline void ceph_decode_timespec(struct timespec *ts, - struct ceph_timespec *tv) + const struct ceph_timespec *tv) { ts->tv_sec = le32_to_cpu(tv->tv_sec); ts->tv_nsec = le32_to_cpu(tv->tv_nsec); } static inline void ceph_encode_timespec(struct ceph_timespec *tv, - struct timespec *ts) + const struct timespec *ts) { tv->tv_sec = cpu_to_le32(ts->tv_sec); tv->tv_nsec = cpu_to_le32(ts->tv_nsec); } +/* + * sockaddr_storage <-> ceph_sockaddr + */ +static inline void ceph_encode_addr(struct ceph_entity_addr *a) +{ + a->in_addr.ss_family = htons(a->in_addr.ss_family); +} +static inline void ceph_decode_addr(struct ceph_entity_addr *a) +{ + a->in_addr.ss_family = ntohs(a->in_addr.ss_family); +} + /* * encoders */ diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 80daea06447..4226c810ce2 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -86,6 +86,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) ceph_decode_need(p, end, sizeof(addr) + 1 + sizeof(u32), bad); ceph_decode_copy(p, &addr, sizeof(addr)); + ceph_decode_addr(&addr); infoversion = ceph_decode_8(p); namelen = ceph_decode_32(p); /* skip mds name */ *p += namelen; diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index b48abc0b3be..6ff44bbddf6 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -12,6 +12,7 @@ #include "super.h" #include "messenger.h" +#include "decode.h" /* * Ceph uses the messenger to exchange ceph_msg messages with other @@ -97,6 +98,12 @@ const char *pr_addr(const struct sockaddr_storage *ss) return s; } +static void encode_my_addr(struct ceph_messenger *msgr) +{ + memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr)); + ceph_encode_addr(&msgr->my_enc_addr); +} + /* * work queue for all reading and writing to/from the socket. */ @@ -590,12 +597,12 @@ static void prepare_write_connect(struct ceph_messenger *msgr, con->out_kvec[0].iov_base = CEPH_BANNER; con->out_kvec[0].iov_len = len; - con->out_kvec[1].iov_base = &msgr->inst.addr; - con->out_kvec[1].iov_len = sizeof(msgr->inst.addr); + con->out_kvec[1].iov_base = &msgr->my_enc_addr; + con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr); con->out_kvec[2].iov_base = &con->out_connect; con->out_kvec[2].iov_len = sizeof(con->out_connect); con->out_kvec_left = 3; - con->out_kvec_bytes = len + sizeof(msgr->inst.addr) + + con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr) + sizeof(con->out_connect); con->out_kvec_cur = con->out_kvec; con->out_more = 0; @@ -976,6 +983,9 @@ static int process_connect(struct ceph_connection *con) if (verify_hello(con) < 0) return -1; + ceph_decode_addr(&con->actual_peer_addr); + ceph_decode_addr(&con->peer_addr_for_me); + /* * Make sure the other end is who we wanted. note that the other * end may not yet know their ip address, so if it's 0.0.0.0, give @@ -1005,6 +1015,7 @@ static int process_connect(struct ceph_connection *con) &con->peer_addr_for_me.in_addr, sizeof(con->peer_addr_for_me.in_addr)); addr_set_port(&con->msgr->inst.addr.in_addr, port); + encode_my_addr(con->msgr); dout("process_connect learned my addr is %s\n", pr_addr(&con->msgr->inst.addr.in_addr)); } @@ -1780,6 +1791,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr) /* select a random nonce */ get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); + encode_my_addr(msgr); dout("messenger_create %p\n", msgr); return msgr; @@ -1806,8 +1818,9 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) } /* set src+dst */ - msg->hdr.src = con->msgr->inst; - msg->hdr.orig_src = con->msgr->inst; + msg->hdr.src.name = con->msgr->inst.name; + msg->hdr.src.addr = con->msgr->my_enc_addr; + msg->hdr.orig_src = msg->hdr.src; msg->hdr.dst_erank = con->peer_addr.erank; /* queue */ diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h index dcd98b64dca..e016fa7cf97 100644 --- a/fs/ceph/messenger.h +++ b/fs/ceph/messenger.h @@ -53,6 +53,7 @@ extern const char *ceph_name_type_str(int t); struct ceph_messenger { struct ceph_entity_inst inst; /* my name+address */ + struct ceph_entity_addr my_enc_addr; struct page *zero_page; /* used in certain error cases */ bool nocrc; diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c index 61263c99c6a..95b76e761e1 100644 --- a/fs/ceph/mon_client.c +++ b/fs/ceph/mon_client.c @@ -59,6 +59,8 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end) m->epoch = epoch; m->num_mon = num_mon; ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0])); + for (i = 0; i < num_mon; i++) + ceph_decode_addr(&m->mon_inst[i].addr); dout("monmap_decode epoch %d, num_mon %d\n", m->epoch, m->num_mon); diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h index 9abc879e25b..8e3ea2eb1bf 100644 --- a/fs/ceph/msgr.h +++ b/fs/ceph/msgr.h @@ -21,7 +21,7 @@ * whenever the wire protocol changes. try to keep this string length * constant. */ -#define CEPH_BANNER "ceph v022" +#define CEPH_BANNER "ceph v023" #define CEPH_BANNER_MAX_LEN 30 diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index d62e111b8a3..cd7bb265d78 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -460,6 +460,8 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) *p += 4; /* skip length field (should match max) */ ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr)); + for (i = 0; i < map->max_osd; i++) + ceph_decode_addr(&map->osd_addr[i]); /* pg_temp */ ceph_decode_32_safe(p, end, len, bad); @@ -619,6 +621,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_entity_addr addr; ceph_decode_32_safe(p, end, osd, bad); ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad); + ceph_decode_addr(&addr); pr_info("osd%d up\n", osd); BUG_ON(osd >= map->max_osd); map->osd_state[osd] |= CEPH_OSD_UP; -- cgit v1.2.3 From f28bcfbe660a3246621a367020054d4f1a179cd9 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 4 Nov 2009 11:46:35 -0800 Subject: ceph: convert port endianness The port is informational only, but we should make it correct. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 6ff44bbddf6..5cc374850d8 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -891,9 +891,9 @@ static int addr_port(struct sockaddr_storage *ss) { switch (ss->ss_family) { case AF_INET: - return ((struct sockaddr_in *)ss)->sin_port; + return ntohs(((struct sockaddr_in *)ss)->sin_port); case AF_INET6: - return ((struct sockaddr_in6 *)ss)->sin6_port; + return ntohs(((struct sockaddr_in6 *)ss)->sin6_port); } return 0; } -- cgit v1.2.3 From eed0ef2caf928327332da54d23579debe629d5bc Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 10 Nov 2009 14:34:36 -0800 Subject: ceph: separate banner and connect during handshake into distinct stages We need to make sure we only swab the address during the banner once. So break process_banner out of process_connect, and clean up the surrounding code so that these are distinct phases of the handshake. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 117 +++++++++++++++++++++++++++++++++------------------- fs/ceph/messenger.h | 4 +- 2 files changed, 77 insertions(+), 44 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 5cc374850d8..e389656b2a6 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -564,10 +564,26 @@ static void prepare_write_keepalive(struct ceph_connection *con) /* * We connected to a peer and are saying hello. */ -static void prepare_write_connect(struct ceph_messenger *msgr, - struct ceph_connection *con) +static void prepare_write_banner(struct ceph_messenger *msgr, + struct ceph_connection *con) { int len = strlen(CEPH_BANNER); + + con->out_kvec[0].iov_base = CEPH_BANNER; + con->out_kvec[0].iov_len = len; + con->out_kvec[1].iov_base = &msgr->my_enc_addr; + con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr); + con->out_kvec_left = 2; + con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr); + con->out_kvec_cur = con->out_kvec; + con->out_more = 0; + set_bit(WRITE_PENDING, &con->state); +} + +static void prepare_write_connect(struct ceph_messenger *msgr, + struct ceph_connection *con, + int after_banner) +{ unsigned global_seq = get_global_seq(con->msgr, 0); int proto; @@ -595,32 +611,14 @@ static void prepare_write_connect(struct ceph_messenger *msgr, if (test_bit(LOSSYTX, &con->state)) con->out_connect.flags = CEPH_MSG_CONNECT_LOSSY; - con->out_kvec[0].iov_base = CEPH_BANNER; - con->out_kvec[0].iov_len = len; - con->out_kvec[1].iov_base = &msgr->my_enc_addr; - con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr); - con->out_kvec[2].iov_base = &con->out_connect; - con->out_kvec[2].iov_len = sizeof(con->out_connect); - con->out_kvec_left = 3; - con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr) + - sizeof(con->out_connect); - con->out_kvec_cur = con->out_kvec; - con->out_more = 0; - set_bit(WRITE_PENDING, &con->state); -} - -static void prepare_write_connect_retry(struct ceph_messenger *msgr, - struct ceph_connection *con) -{ - dout("prepare_write_connect_retry %p\n", con); - con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); - con->out_connect.global_seq = - cpu_to_le32(get_global_seq(con->msgr, 0)); - - con->out_kvec[0].iov_base = &con->out_connect; - con->out_kvec[0].iov_len = sizeof(con->out_connect); - con->out_kvec_left = 1; - con->out_kvec_bytes = sizeof(con->out_connect); + if (!after_banner) { + con->out_kvec_left = 0; + con->out_kvec_bytes = 0; + } + con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect; + con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect); + con->out_kvec_left++; + con->out_kvec_bytes += sizeof(con->out_connect); con->out_kvec_cur = con->out_kvec; con->out_more = 0; set_bit(WRITE_PENDING, &con->state); @@ -778,6 +776,12 @@ out: /* * Prepare to read connection handshake, or an ack. */ +static void prepare_read_banner(struct ceph_connection *con) +{ + dout("prepare_read_banner %p\n", con); + con->in_base_pos = 0; +} + static void prepare_read_connect(struct ceph_connection *con) { dout("prepare_read_connect %p\n", con); @@ -829,11 +833,11 @@ static int read_partial(struct ceph_connection *con, /* * Read all or part of the connect-side handshake on a new connection */ -static int read_partial_connect(struct ceph_connection *con) +static int read_partial_banner(struct ceph_connection *con) { int ret, to = 0; - dout("read_partial_connect %p at %d\n", con, con->in_base_pos); + dout("read_partial_banner %p at %d\n", con, con->in_base_pos); /* peer's banner */ ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner); @@ -847,6 +851,16 @@ static int read_partial_connect(struct ceph_connection *con) &con->peer_addr_for_me); if (ret <= 0) goto out; +out: + return ret; +} + +static int read_partial_connect(struct ceph_connection *con) +{ + int ret, to = 0; + + dout("read_partial_connect %p at %d\n", con, con->in_base_pos); + ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply); if (ret <= 0) goto out; @@ -856,6 +870,7 @@ static int read_partial_connect(struct ceph_connection *con) le32_to_cpu(con->in_reply.global_seq)); out: return ret; + } /* @@ -976,9 +991,9 @@ bad: return -EINVAL; } -static int process_connect(struct ceph_connection *con) +static int process_banner(struct ceph_connection *con) { - dout("process_connect on %p tag %d\n", con, (int)con->in_tag); + dout("process_banner on %p\n", con); if (verify_hello(con) < 0) return -1; @@ -1016,10 +1031,19 @@ static int process_connect(struct ceph_connection *con) sizeof(con->peer_addr_for_me.in_addr)); addr_set_port(&con->msgr->inst.addr.in_addr, port); encode_my_addr(con->msgr); - dout("process_connect learned my addr is %s\n", + dout("process_banner learned my addr is %s\n", pr_addr(&con->msgr->inst.addr.in_addr)); } + set_bit(NEGOTIATING, &con->state); + prepare_read_connect(con); + return 0; +} + +static int process_connect(struct ceph_connection *con) +{ + dout("process_connect on %p tag %d\n", con, (int)con->in_tag); + switch (con->in_reply.tag) { case CEPH_MSGR_TAG_BADPROTOVER: dout("process_connect got BADPROTOVER my %d != their %d\n", @@ -1053,7 +1077,7 @@ static int process_connect(struct ceph_connection *con) ENTITY_NAME(con->peer_name), pr_addr(&con->peer_addr.in_addr)); reset_connection(con); - prepare_write_connect_retry(con->msgr, con); + prepare_write_connect(con->msgr, con, 0); prepare_read_connect(con); /* Tell ceph about it. */ @@ -1071,7 +1095,7 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->out_connect.connect_seq), le32_to_cpu(con->in_connect.connect_seq)); con->connect_seq = le32_to_cpu(con->in_connect.connect_seq); - prepare_write_connect_retry(con->msgr, con); + prepare_write_connect(con->msgr, con, 0); prepare_read_connect(con); break; @@ -1080,19 +1104,17 @@ static int process_connect(struct ceph_connection *con) * If we sent a smaller global_seq than the peer has, try * again with a larger value. */ - dout("process_connect got RETRY_GLOBAL my %u, peer_gseq = %u\n", + dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", con->peer_global_seq, le32_to_cpu(con->in_connect.global_seq)); get_global_seq(con->msgr, le32_to_cpu(con->in_connect.global_seq)); - prepare_write_connect_retry(con->msgr, con); + prepare_write_connect(con->msgr, con, 0); prepare_read_connect(con); break; case CEPH_MSGR_TAG_READY: clear_bit(CONNECTING, &con->state); - if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) - set_bit(LOSSYRX, &con->state); con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); con->connect_seq++; dout("process_connect got READY gseq %d cseq %d (%d)\n", @@ -1420,9 +1442,11 @@ more: if (test_and_clear_bit(STANDBY, &con->state)) con->connect_seq++; - prepare_write_connect(msgr, con); - prepare_read_connect(con); + prepare_write_banner(msgr, con); + prepare_write_connect(msgr, con, 1); + prepare_read_banner(con); set_bit(CONNECTING, &con->state); + clear_bit(NEGOTIATING, &con->state); con->in_tag = CEPH_MSGR_TAG_READY; dout("try_write initiating connect on %p new state %lu\n", @@ -1521,7 +1545,16 @@ more: dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, con->in_base_pos); if (test_bit(CONNECTING, &con->state)) { - dout("try_read connecting\n"); + if (!test_bit(NEGOTIATING, &con->state)) { + dout("try_read connecting\n"); + ret = read_partial_banner(con); + if (ret <= 0) + goto done; + if (process_banner(con) < 0) { + ret = -1; + goto out; + } + } ret = read_partial_connect(con); if (ret <= 0) goto done; diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h index e016fa7cf97..80f7e1e9444 100644 --- a/fs/ceph/messenger.h +++ b/fs/ceph/messenger.h @@ -104,8 +104,8 @@ struct ceph_msg_pos { * thread is currently opening, reading or writing data to the socket. */ #define LOSSYTX 0 /* we can close channel or drop messages on errors */ -#define LOSSYRX 1 /* peer may reset/drop messages */ -#define CONNECTING 2 +#define CONNECTING 1 +#define NEGOTIATING 2 #define KEEPALIVE_PENDING 3 #define WRITE_PENDING 4 /* we have data ready to send */ #define QUEUED 5 /* there is work queued on this connection */ -- cgit v1.2.3 From 71ececdacae24be333c534869cb1b06357f0e215 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 18 Nov 2009 11:27:06 -0800 Subject: ceph: remove unnecessary ceph_con_shutdown We require that ceph_con_close be called before we drop the connection, so this is unneeded. Just BUG if con->sock != NULL. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index e389656b2a6..d8a6a56a157 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -339,17 +339,6 @@ void ceph_con_close(struct ceph_connection *con) queue_con(con); } -/* - * clean up connection state - */ -void ceph_con_shutdown(struct ceph_connection *con) -{ - dout("con_shutdown %p\n", con); - reset_connection(con); - set_bit(DEAD, &con->state); - con_close_socket(con); /* silently ignore errors */ -} - /* * Reopen a closed connection, with a new peer address. */ @@ -380,7 +369,7 @@ void ceph_con_put(struct ceph_connection *con) atomic_read(&con->nref), atomic_read(&con->nref) - 1); BUG_ON(atomic_read(&con->nref) == 0); if (atomic_dec_and_test(&con->nref)) { - ceph_con_shutdown(con); + BUG_ON(con->sock); kfree(con); } } -- cgit v1.2.3 From 4e7a5dcd1bbab6560fbc8ada29a840e7a20ed7bc Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 18 Nov 2009 16:19:57 -0800 Subject: ceph: negotiate authentication protocol; implement AUTH_NONE protocol When we open a monitor session, we send an initial AUTH message listing the auth protocols we support, our entity name, and (possibly) a previously assigned global_id. The monitor chooses a protocol and responds with an initial message. Initially implement AUTH_NONE, a dummy protocol that provides no security, but works within the new framework. It generates 'authorizers' that are used when connecting to (mds, osd) services that simply state our entity name and global_id. This is a wire protocol change. Signed-off-by: Sage Weil --- fs/ceph/Makefile | 1 + fs/ceph/auth.c | 220 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/ceph/auth.h | 77 +++++++++++++++++ fs/ceph/auth_none.c | 120 +++++++++++++++++++++++++++ fs/ceph/auth_none.h | 28 +++++++ fs/ceph/ceph_fs.h | 14 +++- fs/ceph/ceph_strings.c | 13 +++ fs/ceph/decode.h | 6 ++ fs/ceph/mds_client.c | 69 +++++++++++++++- fs/ceph/mds_client.h | 4 + fs/ceph/messenger.c | 54 +++++++++++- fs/ceph/messenger.h | 10 +++ fs/ceph/mon_client.c | 210 +++++++++++++++++++++++++++++++++++++--------- fs/ceph/mon_client.h | 14 ++-- fs/ceph/msgr.h | 21 +++-- fs/ceph/osd_client.c | 63 +++++++++++++- fs/ceph/osd_client.h | 4 + fs/ceph/rados.h | 2 +- fs/ceph/super.c | 32 ++++--- fs/ceph/super.h | 2 + 20 files changed, 888 insertions(+), 76 deletions(-) create mode 100644 fs/ceph/auth.c create mode 100644 fs/ceph/auth.h create mode 100644 fs/ceph/auth_none.c create mode 100644 fs/ceph/auth_none.h (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index bdd3e6fc160..827629c8576 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -13,6 +13,7 @@ ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \ mon_client.o \ osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \ debugfs.o \ + auth.o auth_none.o \ ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o else diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c new file mode 100644 index 00000000000..c4d1eee827a --- /dev/null +++ b/fs/ceph/auth.c @@ -0,0 +1,220 @@ +#include "ceph_debug.h" + +#include +#include + +#include "types.h" +#include "auth_none.h" +#include "decode.h" +#include "super.h" + +#include "messenger.h" + +/* + * get protocol handler + */ +static u32 supported_protocols[] = { + CEPH_AUTH_NONE +}; + +int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol) +{ + switch (protocol) { + case CEPH_AUTH_NONE: + return ceph_auth_none_init(ac); + default: + return -ENOENT; + } +} + +/* + * setup, teardown. + */ +struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret) +{ + struct ceph_auth_client *ac; + int ret; + + dout("auth_init name '%s' secret '%s'\n", name, secret); + + ret = -ENOMEM; + ac = kzalloc(sizeof(*ac), GFP_NOFS); + if (!ac) + goto out; + + ac->negotiating = true; + if (name) + ac->name = name; + else + ac->name = CEPH_AUTH_NAME_DEFAULT; + dout("auth_init name %s secret %s\n", ac->name, secret); + ac->secret = secret; + return ac; + +out: + return ERR_PTR(ret); +} + +void ceph_auth_destroy(struct ceph_auth_client *ac) +{ + dout("auth_destroy %p\n", ac); + if (ac->ops) + ac->ops->destroy(ac); + kfree(ac); +} + +/* + * Reset occurs when reconnecting to the monitor. + */ +void ceph_auth_reset(struct ceph_auth_client *ac) +{ + dout("auth_reset %p\n", ac); + if (ac->ops && !ac->negotiating) + ac->ops->reset(ac); + ac->negotiating = true; +} + +int ceph_entity_name_encode(const char *name, void **p, void *end) +{ + int len = strlen(name); + + if (*p + 2*sizeof(u32) + len > end) + return -ERANGE; + ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT); + ceph_encode_32(p, len); + ceph_encode_copy(p, name, len); + return 0; +} + +/* + * Initiate protocol negotiation with monitor. Include entity name + * and list supported protocols. + */ +int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len) +{ + struct ceph_mon_request_header *monhdr = buf; + void *p = monhdr + 1, *end = buf + len, *lenp; + int i, num; + int ret; + + dout("auth_build_hello\n"); + monhdr->have_version = 0; + monhdr->session_mon = cpu_to_le16(-1); + monhdr->session_mon_tid = 0; + + ceph_encode_32(&p, 0); /* no protocol, yet */ + + lenp = p; + p += sizeof(u32); + + num = ARRAY_SIZE(supported_protocols); + ceph_encode_32(&p, num); + for (i = 0; i < num; i++) + ceph_encode_32(&p, supported_protocols[i]); + + ret = ceph_entity_name_encode(ac->name, &p, end); + if (ret < 0) + return ret; + ceph_decode_need(&p, end, sizeof(u64), bad); + ceph_encode_64(&p, ac->global_id); + + ceph_encode_32(&lenp, p - lenp - sizeof(u32)); + return p - buf; + +bad: + return -ERANGE; +} + +/* + * Handle auth message from monitor. + */ +int ceph_handle_auth_reply(struct ceph_auth_client *ac, + void *buf, size_t len, + void *reply_buf, size_t reply_len) +{ + void *p = buf; + void *end = buf + len; + int protocol; + s32 result; + u64 global_id; + void *payload, *payload_end; + int payload_len; + char *result_msg; + int result_msg_len; + int ret = -EINVAL; + + dout("handle_auth_reply %p %p\n", p, end); + ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad); + protocol = ceph_decode_32(&p); + result = ceph_decode_32(&p); + global_id = ceph_decode_64(&p); + payload_len = ceph_decode_32(&p); + payload = p; + p += payload_len; + ceph_decode_need(&p, end, sizeof(u32), bad); + result_msg_len = ceph_decode_32(&p); + result_msg = p; + p += result_msg_len; + if (p != end) + goto bad; + + dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len, + result_msg, global_id, payload_len); + + payload_end = payload + payload_len; + + if (global_id && ac->global_id != global_id) { + dout(" set global_id %lld -> %lld\n", ac->global_id, global_id); + ac->global_id = global_id; + } + + if (ac->negotiating) { + /* set up (new) protocol handler? */ + if (ac->protocol && ac->protocol != protocol) { + ac->ops->destroy(ac); + ac->protocol = 0; + ac->ops = NULL; + } + if (ac->protocol != protocol) { + ret = ceph_auth_init_protocol(ac, protocol); + if (ret) { + pr_err("error %d on auth protocol %d init\n", + ret, protocol); + goto out; + } + } + } + + ret = ac->ops->handle_reply(ac, result, payload, payload_end); + if (ret == -EAGAIN) { + struct ceph_mon_request_header *monhdr = reply_buf; + void *p = reply_buf + 1; + void *end = reply_buf + reply_len; + + monhdr->have_version = 0; + monhdr->session_mon = cpu_to_le16(-1); + monhdr->session_mon_tid = 0; + + ceph_encode_32(&p, ac->protocol); + + ret = ac->ops->build_request(ac, p + sizeof(u32), end); + if (ret < 0) { + pr_err("error %d building request\n", ret); + goto out; + } + dout(" built request %d bytes\n", ret); + ceph_encode_32(&p, ret); + return p + ret - reply_buf; + } else if (ret) { + pr_err("authentication error %d\n", ret); + return ret; + } + return 0; + +bad: + pr_err("failed to decode auth msg\n"); +out: + return ret; +} + + diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h new file mode 100644 index 00000000000..4d8cdf6bb3b --- /dev/null +++ b/fs/ceph/auth.h @@ -0,0 +1,77 @@ +#ifndef _FS_CEPH_AUTH_H +#define _FS_CEPH_AUTH_H + +#include "types.h" +#include "buffer.h" + +/* + * Abstract interface for communicating with the authenticate module. + * There is some handshake that takes place between us and the monitor + * to acquire the necessary keys. These are used to generate an + * 'authorizer' that we use when connecting to a service (mds, osd). + */ + +struct ceph_auth_client; +struct ceph_authorizer; + +struct ceph_auth_client_ops { + /* + * true if we are authenticated and can connect to + * services. + */ + int (*is_authenticated)(struct ceph_auth_client *ac); + + /* + * build requests and process replies during monitor + * handshake. if handle_reply returns -EAGAIN, we build + * another request. + */ + int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end); + int (*handle_reply)(struct ceph_auth_client *ac, int result, + void *buf, void *end); + + /* + * Create authorizer for connecting to a service, and verify + * the response to authenticate the service. + */ + int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type, + struct ceph_authorizer **a, + void **buf, size_t *len, + void **reply_buf, size_t *reply_len); + int (*verify_authorizer_reply)(struct ceph_auth_client *ac, + struct ceph_authorizer *a, size_t len); + void (*destroy_authorizer)(struct ceph_auth_client *ac, + struct ceph_authorizer *a); + + /* reset when we (re)connect to a monitor */ + void (*reset)(struct ceph_auth_client *ac); + + void (*destroy)(struct ceph_auth_client *ac); +}; + +struct ceph_auth_client { + u32 protocol; /* CEPH_AUTH_* */ + void *private; /* for use by protocol implementation */ + const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */ + + bool negotiating; /* true if negotiating protocol */ + const char *name; /* entity name */ + u64 global_id; /* our unique id in system */ + const char *secret; /* our secret key */ + unsigned want_keys; /* which services we want */ +}; + +extern struct ceph_auth_client *ceph_auth_init(const char *name, + const char *secret); +extern void ceph_auth_destroy(struct ceph_auth_client *ac); + +extern void ceph_auth_reset(struct ceph_auth_client *ac); + +extern int ceph_auth_build_hello(struct ceph_auth_client *ac, + void *buf, size_t len); +extern int ceph_handle_auth_reply(struct ceph_auth_client *ac, + void *buf, size_t len, + void *reply_buf, size_t reply_len); +extern int ceph_entity_name_encode(const char *name, void **p, void *end); + +#endif diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c new file mode 100644 index 00000000000..631017eb711 --- /dev/null +++ b/fs/ceph/auth_none.c @@ -0,0 +1,120 @@ + +#include "ceph_debug.h" + +#include +#include +#include + +#include "auth_none.h" +#include "auth.h" +#include "decode.h" + +static void reset(struct ceph_auth_client *ac) +{ + struct ceph_auth_none_info *xi = ac->private; + + xi->starting = true; + xi->built_authorizer = false; +} + +static void destroy(struct ceph_auth_client *ac) +{ + kfree(ac->private); + ac->private = NULL; +} + +static int is_authenticated(struct ceph_auth_client *ac) +{ + struct ceph_auth_none_info *xi = ac->private; + + return !xi->starting; +} + +/* + * the generic auth code decode the global_id, and we carry no actual + * authenticate state, so nothing happens here. + */ +static int handle_reply(struct ceph_auth_client *ac, int result, + void *buf, void *end) +{ + struct ceph_auth_none_info *xi = ac->private; + + xi->starting = false; + return result; +} + +/* + * build an 'authorizer' with our entity_name and global_id. we can + * reuse a single static copy since it is identical for all services + * we connect to. + */ +static int ceph_auth_none_create_authorizer( + struct ceph_auth_client *ac, int peer_type, + struct ceph_authorizer **a, + void **buf, size_t *len, + void **reply_buf, size_t *reply_len) +{ + struct ceph_auth_none_info *ai = ac->private; + struct ceph_none_authorizer *au = &ai->au; + void *p, *end; + int ret; + + if (!ai->built_authorizer) { + p = au->buf; + end = p + sizeof(au->buf); + ret = ceph_entity_name_encode(ac->name, &p, end - 8); + if (ret < 0) + goto bad; + ceph_decode_need(&p, end, sizeof(u64), bad2); + ceph_encode_64(&p, ac->global_id); + au->buf_len = p - (void *)au->buf; + ai->built_authorizer = true; + dout("built authorizer len %d\n", au->buf_len); + } + + *a = (struct ceph_authorizer *)au; + *buf = au->buf; + *len = au->buf_len; + *reply_buf = au->reply_buf; + *reply_len = sizeof(au->reply_buf); + return 0; + +bad2: + ret = -ERANGE; +bad: + return ret; +} + +static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac, + struct ceph_authorizer *a) +{ + /* nothing to do */ +} + +static const struct ceph_auth_client_ops ceph_auth_none_ops = { + .reset = reset, + .destroy = destroy, + .is_authenticated = is_authenticated, + .handle_reply = handle_reply, + .create_authorizer = ceph_auth_none_create_authorizer, + .destroy_authorizer = ceph_auth_none_destroy_authorizer, +}; + +int ceph_auth_none_init(struct ceph_auth_client *ac) +{ + struct ceph_auth_none_info *xi; + + dout("ceph_auth_none_init %p\n", ac); + xi = kzalloc(sizeof(*xi), GFP_NOFS); + if (!xi) + return -ENOMEM; + + xi->starting = true; + xi->built_authorizer = false; + + ac->protocol = CEPH_AUTH_NONE; + ac->private = xi; + ac->ops = &ceph_auth_none_ops; + return 0; +} + diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h new file mode 100644 index 00000000000..56c05533a31 --- /dev/null +++ b/fs/ceph/auth_none.h @@ -0,0 +1,28 @@ +#ifndef _FS_CEPH_AUTH_NONE_H +#define _FS_CEPH_AUTH_NONE_H + +#include "auth.h" + +/* + * null security mode. + * + * we use a single static authorizer that simply encodes our entity name + * and global id. + */ + +struct ceph_none_authorizer { + char buf[128]; + int buf_len; + char reply_buf[0]; +}; + +struct ceph_auth_none_info { + bool starting; + bool built_authorizer; + struct ceph_none_authorizer au; /* we only need one; it's static */ +}; + +extern int ceph_auth_none_init(struct ceph_auth_client *ac); + +#endif + diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h index 36becb02478..1e96a9a87d8 100644 --- a/fs/ceph/ceph_fs.h +++ b/fs/ceph/ceph_fs.h @@ -75,6 +75,16 @@ struct ceph_file_layout { int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); +/* crypto algorithms */ +#define CEPH_CRYPTO_NONE 0x0 +#define CEPH_CRYPTO_AES 0x1 + +/* security/authentication protocols */ +#define CEPH_AUTH_UNKNOWN 0x0 +#define CEPH_AUTH_NONE 0x1 +#define CEPH_AUTH_CEPHX 0x2 + + /********************************************* * message layer */ @@ -90,12 +100,12 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); /* client <-> monitor */ #define CEPH_MSG_MON_MAP 4 #define CEPH_MSG_MON_GET_MAP 5 -#define CEPH_MSG_CLIENT_MOUNT 10 -#define CEPH_MSG_CLIENT_MOUNT_ACK 11 #define CEPH_MSG_STATFS 13 #define CEPH_MSG_STATFS_REPLY 14 #define CEPH_MSG_MON_SUBSCRIBE 15 #define CEPH_MSG_MON_SUBSCRIBE_ACK 16 +#define CEPH_MSG_AUTH 17 +#define CEPH_MSG_AUTH_REPLY 18 /* client <-> mds */ #define CEPH_MSG_MDS_MAP 21 diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c index 90d19d9d8d8..8e4be6a80c6 100644 --- a/fs/ceph/ceph_strings.c +++ b/fs/ceph/ceph_strings.c @@ -3,6 +3,19 @@ */ #include "types.h" +const char *ceph_entity_type_name(int type) +{ + switch (type) { + case CEPH_ENTITY_TYPE_MDS: return "mds"; + case CEPH_ENTITY_TYPE_OSD: return "osd"; + case CEPH_ENTITY_TYPE_MON: return "mon"; + case CEPH_ENTITY_TYPE_CLIENT: return "client"; + case CEPH_ENTITY_TYPE_ADMIN: return "admin"; + case CEPH_ENTITY_TYPE_AUTH: return "auth"; + default: return "unknown"; + } +} + const char *ceph_osd_op_name(int op) { switch (op) { diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h index a382aecc55b..10de8489624 100644 --- a/fs/ceph/decode.h +++ b/fs/ceph/decode.h @@ -98,6 +98,7 @@ static inline void ceph_encode_addr(struct ceph_entity_addr *a) static inline void ceph_decode_addr(struct ceph_entity_addr *a) { a->in_addr.ss_family = ntohs(a->in_addr.ss_family); + WARN_ON(a->in_addr.ss_family == 512); } /* @@ -123,6 +124,11 @@ static inline void ceph_encode_8(void **p, u8 v) *(u8 *)*p = v; (*p)++; } +static inline void ceph_encode_copy(void **p, const void *s, int len) +{ + memcpy(*p, s, len); + *p += len; +} /* * filepath, string encoders diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 69feeb1c981..8a285158aec 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -8,6 +8,7 @@ #include "super.h" #include "messenger.h" #include "decode.h" +#include "auth.h" /* * A cluster of MDS (metadata server) daemons is responsible for @@ -274,8 +275,12 @@ void ceph_put_mds_session(struct ceph_mds_session *s) { dout("mdsc put_session %p %d -> %d\n", s, atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); - if (atomic_dec_and_test(&s->s_ref)) + if (atomic_dec_and_test(&s->s_ref)) { + if (s->s_authorizer) + s->s_mdsc->client->monc.auth->ops->destroy_authorizer( + s->s_mdsc->client->monc.auth, s->s_authorizer); kfree(s); + } } /* @@ -2777,9 +2782,15 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); ceph_decode_copy(&p, &fsid, sizeof(fsid)); - if (ceph_fsid_compare(&fsid, &mdsc->client->monc.monmap->fsid)) { - pr_err("got mdsmap with wrong fsid\n"); - return; + if (mdsc->client->monc.have_fsid) { + if (ceph_fsid_compare(&fsid, + &mdsc->client->monc.monmap->fsid)) { + pr_err("got mdsmap with wrong fsid\n"); + return; + } + } else { + ceph_fsid_set(&mdsc->client->monc.monmap->fsid, &fsid); + mdsc->client->monc.have_fsid = true; } epoch = ceph_decode_32(&p); maplen = ceph_decode_32(&p); @@ -2895,10 +2906,60 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) ceph_msg_put(msg); } +/* + * authentication + */ +static int get_authorizer(struct ceph_connection *con, + void **buf, int *len, int *proto, + void **reply_buf, int *reply_len, int force_new) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_auth_client *ac = mdsc->client->monc.auth; + int ret = 0; + + if (force_new && s->s_authorizer) { + ac->ops->destroy_authorizer(ac, s->s_authorizer); + s->s_authorizer = NULL; + } + if (s->s_authorizer == NULL) { + if (ac->ops->create_authorizer) { + ret = ac->ops->create_authorizer( + ac, CEPH_ENTITY_TYPE_MDS, + &s->s_authorizer, + &s->s_authorizer_buf, + &s->s_authorizer_buf_len, + &s->s_authorizer_reply_buf, + &s->s_authorizer_reply_buf_len); + if (ret) + return ret; + } + } + + *proto = ac->protocol; + *buf = s->s_authorizer_buf; + *len = s->s_authorizer_buf_len; + *reply_buf = s->s_authorizer_reply_buf; + *reply_len = s->s_authorizer_reply_buf_len; + return 0; +} + + +static int verify_authorizer_reply(struct ceph_connection *con, int len) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_auth_client *ac = mdsc->client->monc.auth; + + return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len); +} + const static struct ceph_connection_operations mds_con_ops = { .get = con_get, .put = con_put, .dispatch = dispatch, + .get_authorizer = get_authorizer, + .verify_authorizer_reply = verify_authorizer_reply, .peer_reset = peer_reset, .alloc_msg = ceph_alloc_msg, .alloc_middle = ceph_alloc_middle, diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 7c439488cfa..9faa1b2f79a 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -100,6 +100,10 @@ struct ceph_mds_session { struct ceph_connection s_con; + struct ceph_authorizer *s_authorizer; + void *s_authorizer_buf, *s_authorizer_reply_buf; + size_t s_authorizer_buf_len, s_authorizer_reply_buf_len; + /* protected by s_cap_lock */ spinlock_t s_cap_lock; u32 s_cap_gen; /* inc each time we get mds stale msg */ diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index d8a6a56a157..0b1674854b2 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -550,6 +550,27 @@ static void prepare_write_keepalive(struct ceph_connection *con) * Connection negotiation. */ +static void prepare_connect_authorizer(struct ceph_connection *con) +{ + void *auth_buf; + int auth_len = 0; + int auth_protocol = 0; + + if (con->ops->get_authorizer) + con->ops->get_authorizer(con, &auth_buf, &auth_len, + &auth_protocol, &con->auth_reply_buf, + &con->auth_reply_buf_len, + con->auth_retry); + + con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol); + con->out_connect.authorizer_len = cpu_to_le32(auth_len); + + con->out_kvec[con->out_kvec_left].iov_base = auth_buf; + con->out_kvec[con->out_kvec_left].iov_len = auth_len; + con->out_kvec_left++; + con->out_kvec_bytes += auth_len; +} + /* * We connected to a peer and are saying hello. */ @@ -592,6 +613,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr, dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, con->connect_seq, global_seq, proto); + con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); con->out_connect.global_seq = cpu_to_le32(global_seq); @@ -611,6 +633,8 @@ static void prepare_write_connect(struct ceph_messenger *msgr, con->out_kvec_cur = con->out_kvec; con->out_more = 0; set_bit(WRITE_PENDING, &con->state); + + prepare_connect_authorizer(con); } @@ -777,6 +801,13 @@ static void prepare_read_connect(struct ceph_connection *con) con->in_base_pos = 0; } +static void prepare_read_connect_retry(struct ceph_connection *con) +{ + dout("prepare_read_connect_retry %p\n", con); + con->in_base_pos = strlen(CEPH_BANNER) + sizeof(con->actual_peer_addr) + + sizeof(con->peer_addr_for_me); +} + static void prepare_read_ack(struct ceph_connection *con) { dout("prepare_read_ack %p\n", con); @@ -853,9 +884,14 @@ static int read_partial_connect(struct ceph_connection *con) ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply); if (ret <= 0) goto out; + ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len), + con->auth_reply_buf); + if (ret <= 0) + goto out; - dout("read_partial_connect %p connect_seq = %u, global_seq = %u\n", - con, le32_to_cpu(con->in_reply.connect_seq), + dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", + con, (int)con->in_reply.tag, + le32_to_cpu(con->in_reply.connect_seq), le32_to_cpu(con->in_reply.global_seq)); out: return ret; @@ -1051,6 +1087,20 @@ static int process_connect(struct ceph_connection *con) set_bit(CLOSED, &con->state); /* in case there's queued work */ return -1; + case CEPH_MSGR_TAG_BADAUTHORIZER: + con->auth_retry++; + dout("process_connect %p got BADAUTHORIZER attempt %d\n", con, + con->auth_retry); + if (con->auth_retry == 2) { + con->error_msg = "connect authorization failure"; + reset_connection(con); + set_bit(CLOSED, &con->state); + return -1; + } + con->auth_retry = 1; + prepare_write_connect(con->msgr, con, 0); + prepare_read_connect_retry(con); + break; case CEPH_MSGR_TAG_RESETSESSION: /* diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h index 4bd85c36308..f9c9f648730 100644 --- a/fs/ceph/messenger.h +++ b/fs/ceph/messenger.h @@ -26,6 +26,12 @@ struct ceph_connection_operations { /* handle an incoming message. */ void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m); + /* authorize an outgoing connection */ + int (*get_authorizer) (struct ceph_connection *con, + void **buf, int *len, int *proto, + void **reply_buf, int *reply_len, int force_new); + int (*verify_authorizer_reply) (struct ceph_connection *con, int len); + /* protocol version mismatch */ void (*bad_proto) (struct ceph_connection *con); @@ -144,6 +150,10 @@ struct ceph_connection { attempt for this connection, client */ u32 peer_global_seq; /* peer's global seq for this connection */ + int auth_retry; /* true if we need a newer authorizer */ + void *auth_reply_buf; /* where to put the authorizer reply */ + int auth_reply_buf_len; + /* out queue */ struct mutex out_mutex; struct list_head out_queue; diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c index 95b76e761e1..017d5aef883 100644 --- a/fs/ceph/mon_client.c +++ b/fs/ceph/mon_client.c @@ -6,6 +6,7 @@ #include "mon_client.h" #include "super.h" +#include "auth.h" #include "decode.h" /* @@ -38,6 +39,10 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end) struct ceph_fsid fsid; u32 epoch, num_mon; u16 version; + u32 len; + + ceph_decode_32_safe(&p, end, len, bad); + ceph_decode_need(&p, end, len, bad); dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p)); @@ -95,8 +100,10 @@ static void __close_session(struct ceph_mon_client *monc) { if (monc->con) { dout("__close_session closing mon%d\n", monc->cur_mon); + ceph_con_revoke(monc->con, monc->m_auth); ceph_con_close(monc->con); monc->cur_mon = -1; + ceph_auth_reset(monc->auth); } } @@ -106,6 +113,7 @@ static void __close_session(struct ceph_mon_client *monc) static int __open_session(struct ceph_mon_client *monc) { char r; + int ret; if (monc->cur_mon < 0) { get_random_bytes(&r, 1); @@ -121,6 +129,15 @@ static int __open_session(struct ceph_mon_client *monc) monc->con->peer_name.num = cpu_to_le64(monc->cur_mon); ceph_con_open(monc->con, &monc->monmap->mon_inst[monc->cur_mon].addr); + + /* initiatiate authentication handshake */ + ret = ceph_auth_build_hello(monc->auth, + monc->m_auth->front.iov_base, + monc->m_auth->front_max); + monc->m_auth->front.iov_len = ret; + monc->m_auth->hdr.front_len = cpu_to_le32(ret); + ceph_msg_get(monc->m_auth); /* keep our ref */ + ceph_con_send(monc->con, monc->m_auth); } else { dout("open_session mon%d already open\n", monc->cur_mon); } @@ -139,7 +156,7 @@ static void __schedule_delayed(struct ceph_mon_client *monc) { unsigned delay; - if (monc->cur_mon < 0 || monc->want_mount || __sub_expired(monc)) + if (monc->cur_mon < 0 || __sub_expired(monc)) delay = 10 * HZ; else delay = 20 * HZ; @@ -161,7 +178,7 @@ static void __send_subscribe(struct ceph_mon_client *monc) struct ceph_mon_subscribe_item *i; void *p, *end; - msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 64, 0, 0, NULL); + msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL); if (!msg) return; @@ -173,7 +190,7 @@ static void __send_subscribe(struct ceph_mon_client *monc) if (monc->want_next_osdmap) { dout("__send_subscribe to 'osdmap' %u\n", (unsigned)monc->have_osdmap); - ceph_encode_32(&p, 2); + ceph_encode_32(&p, 3); ceph_encode_string(&p, end, "osdmap", 6); i = p; i->have = cpu_to_le64(monc->have_osdmap); @@ -181,13 +198,18 @@ static void __send_subscribe(struct ceph_mon_client *monc) p += sizeof(*i); monc->want_next_osdmap = 2; /* requested */ } else { - ceph_encode_32(&p, 1); + ceph_encode_32(&p, 2); } ceph_encode_string(&p, end, "mdsmap", 6); i = p; i->have = cpu_to_le64(monc->have_mdsmap); i->onetime = 0; p += sizeof(*i); + ceph_encode_string(&p, end, "monmap", 6); + i = p; + i->have = 0; + i->onetime = 0; + p += sizeof(*i); msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); @@ -256,7 +278,7 @@ void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc) mutex_unlock(&monc->mutex); } - +#if 0 /* * mount */ @@ -264,12 +286,8 @@ static void __request_mount(struct ceph_mon_client *monc) { struct ceph_msg *msg; struct ceph_client_mount *h; - int err; dout("__request_mount\n"); - err = __open_session(monc); - if (err) - return; msg = ceph_msg_new(CEPH_MSG_CLIENT_MOUNT, sizeof(*h), 0, 0, NULL); if (IS_ERR(msg)) return; @@ -279,8 +297,12 @@ static void __request_mount(struct ceph_mon_client *monc) h->monhdr.session_mon_tid = 0; ceph_con_send(monc->con, msg); } +#endif -int ceph_monc_request_mount(struct ceph_mon_client *monc) +/* + * + */ +int ceph_monc_open_session(struct ceph_mon_client *monc) { if (!monc->con) { monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL); @@ -292,12 +314,14 @@ int ceph_monc_request_mount(struct ceph_mon_client *monc) } mutex_lock(&monc->mutex); - __request_mount(monc); + __open_session(monc); __schedule_delayed(monc); mutex_unlock(&monc->mutex); return 0; } +#if 0 + /* * The monitor responds with mount ack indicate mount success. The * included client ticket allows the client to talk to MDSs and OSDs. @@ -372,9 +396,65 @@ out: mutex_unlock(&monc->mutex); wake_up(&client->mount_wq); } +#endif + +/* + * The monitor responds with mount ack indicate mount success. The + * included client ticket allows the client to talk to MDSs and OSDs. + */ +static void ceph_monc_handle_map(struct ceph_mon_client *monc, struct ceph_msg *msg) +{ + struct ceph_client *client = monc->client; + struct ceph_monmap *monmap = NULL, *old = monc->monmap; + void *p, *end; + + mutex_lock(&monc->mutex); + + dout("handle_monmap\n"); + p = msg->front.iov_base; + end = p + msg->front.iov_len; + + monmap = ceph_monmap_decode(p, end); + if (IS_ERR(monmap)) { + pr_err("problem decoding monmap, %d\n", + (int)PTR_ERR(monmap)); + return; + } + if (monc->have_fsid && + ceph_fsid_compare(&monmap->fsid, &monc->monmap->fsid)) { + print_hex_dump(KERN_ERR, "monmap->fsid: ", DUMP_PREFIX_NONE, 16, 1, + (void *)&monmap->fsid, 16, 0); + print_hex_dump(KERN_ERR, "monc->monmap->fsid: ", DUMP_PREFIX_NONE, 16, 1, + (void *)&monc->monmap->fsid, 16, 0); + + pr_err("fsid mismatch, got a previous map with different fsid"); + kfree(monmap); + return; + } + + client->monc.monmap = monmap; + client->monc.have_fsid = true; + kfree(old); + + mutex_unlock(&monc->mutex); + wake_up(&client->mount_wq); +} + +/* + * init client info after authentication + */ +static void __init_authenticated_client(struct ceph_mon_client *monc) +{ + struct ceph_client *client = monc->client; + client->signed_ticket = NULL; + client->signed_ticket_len = 0; + client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; + client->msgr->inst.name.num = monc->auth->global_id; + ceph_debugfs_client_init(client); +} /* * statfs @@ -414,12 +494,8 @@ static int send_statfs(struct ceph_mon_client *monc, { struct ceph_msg *msg; struct ceph_mon_statfs *h; - int err; dout("send_statfs tid %llu\n", req->tid); - err = __open_session(monc); - if (err) - return err; msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL); if (IS_ERR(msg)) return PTR_ERR(msg); @@ -514,17 +590,14 @@ static void delayed_work(struct work_struct *work) dout("monc delayed_work\n"); mutex_lock(&monc->mutex); - if (monc->want_mount) { - __request_mount(monc); + if (monc->hunting) { + __close_session(monc); + __open_session(monc); /* continue hunting */ } else { - if (monc->hunting) { - __close_session(monc); - __open_session(monc); /* continue hunting */ - } else { - ceph_con_keepalive(monc->con); - } + ceph_con_keepalive(monc->con); + if (monc->auth->ops->is_authenticated(monc->auth)) + __send_subscribe(monc); } - __send_subscribe(monc); __schedule_delayed(monc); mutex_unlock(&monc->mutex); } @@ -555,6 +628,7 @@ static int build_initial_monmap(struct ceph_mon_client *monc) monc->monmap->mon_inst[i].name.num = cpu_to_le64(i); } monc->monmap->num_mon = num_mon; + monc->have_fsid = false; /* release addr memory */ kfree(args->mon_addr); @@ -579,21 +653,37 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) monc->con = NULL; + /* authentication */ + monc->auth = ceph_auth_init(cl->mount_args->name, + cl->mount_args->secret); + if (IS_ERR(monc->auth)) + return PTR_ERR(monc->auth); + monc->auth->want_keys = + CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | + CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; + /* msg pools */ - err = ceph_msgpool_init(&monc->msgpool_mount_ack, 4096, 1, false); - if (err < 0) - goto out; err = ceph_msgpool_init(&monc->msgpool_subscribe_ack, sizeof(struct ceph_mon_subscribe_ack), 1, false); if (err < 0) - goto out; + goto out_monmap; err = ceph_msgpool_init(&monc->msgpool_statfs_reply, sizeof(struct ceph_mon_statfs_reply), 0, false); if (err < 0) - goto out; + goto out_pool1; + err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false); + if (err < 0) + goto out_pool2; + + monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL); + if (IS_ERR(monc->m_auth)) { + err = PTR_ERR(monc->m_auth); + monc->m_auth = NULL; + goto out_pool3; + } monc->cur_mon = -1; - monc->hunting = false; /* not really */ + monc->hunting = true; monc->sub_renew_after = jiffies; monc->sub_sent = 0; @@ -605,7 +695,16 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) monc->have_mdsmap = 0; monc->have_osdmap = 0; monc->want_next_osdmap = 1; - monc->want_mount = true; + return 0; + +out_pool3: + ceph_msgpool_destroy(&monc->msgpool_auth_reply); +out_pool2: + ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); +out_pool1: + ceph_msgpool_destroy(&monc->msgpool_statfs_reply); +out_monmap: + kfree(monc->monmap); out: return err; } @@ -624,14 +723,44 @@ void ceph_monc_stop(struct ceph_mon_client *monc) } mutex_unlock(&monc->mutex); - ceph_msgpool_destroy(&monc->msgpool_mount_ack); + ceph_auth_destroy(monc->auth); + + ceph_msg_put(monc->m_auth); ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); ceph_msgpool_destroy(&monc->msgpool_statfs_reply); + ceph_msgpool_destroy(&monc->msgpool_auth_reply); kfree(monc->monmap); } +static void handle_auth_reply(struct ceph_mon_client *monc, + struct ceph_msg *msg) +{ + int ret; + + mutex_lock(&monc->mutex); + ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, + msg->front.iov_len, + monc->m_auth->front.iov_base, + monc->m_auth->front_max); + if (ret < 0) { + monc->client->mount_err = ret; + wake_up(&monc->client->mount_wq); + } else if (ret > 0) { + monc->m_auth->front.iov_len = ret; + monc->m_auth->hdr.front_len = cpu_to_le32(ret); + ceph_msg_get(monc->m_auth); /* keep our ref */ + ceph_con_send(monc->con, monc->m_auth); + } else if (monc->auth->ops->is_authenticated(monc->auth)) { + dout("authenticated, starting session\n"); + __init_authenticated_client(monc); + __send_subscribe(monc); + __resend_statfs(monc); + } + mutex_unlock(&monc->mutex); +} + /* * handle incoming message */ @@ -644,8 +773,8 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) return; switch (type) { - case CEPH_MSG_CLIENT_MOUNT_ACK: - handle_mount_ack(monc, msg); + case CEPH_MSG_AUTH_REPLY: + handle_auth_reply(monc, msg); break; case CEPH_MSG_MON_SUBSCRIBE_ACK: @@ -656,6 +785,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) handle_statfs_reply(monc, msg); break; + case CEPH_MSG_MON_MAP: + ceph_monc_handle_map(monc, msg); + break; + case CEPH_MSG_MDS_MAP: ceph_mdsc_handle_map(&monc->client->mdsc, msg); break; @@ -682,12 +815,12 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, int front = le32_to_cpu(hdr->front_len); switch (type) { - case CEPH_MSG_CLIENT_MOUNT_ACK: - return ceph_msgpool_get(&monc->msgpool_mount_ack, front); case CEPH_MSG_MON_SUBSCRIBE_ACK: return ceph_msgpool_get(&monc->msgpool_subscribe_ack, front); case CEPH_MSG_STATFS_REPLY: return ceph_msgpool_get(&monc->msgpool_statfs_reply, front); + case CEPH_MSG_AUTH_REPLY: + return ceph_msgpool_get(&monc->msgpool_auth_reply, front); } return ceph_alloc_msg(con, hdr); } @@ -717,10 +850,7 @@ static void mon_fault(struct ceph_connection *con) if (!monc->hunting) { /* start hunting */ monc->hunting = true; - if (__open_session(monc) == 0) { - __send_subscribe(monc); - __resend_statfs(monc); - } + __open_session(monc); } else { /* already hunting, let's wait a bit */ __schedule_delayed(monc); diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h index 9f6db45bf46..c75b53302ec 100644 --- a/fs/ceph/mon_client.h +++ b/fs/ceph/mon_client.h @@ -9,6 +9,7 @@ struct ceph_client; struct ceph_mount_args; +struct ceph_auth_client; /* * The monitor map enumerates the set of all monitors. @@ -58,23 +59,26 @@ struct ceph_mon_client { struct mutex mutex; struct delayed_work delayed_work; + struct ceph_auth_client *auth; + struct ceph_msg *m_auth; + bool hunting; int cur_mon; /* last monitor i contacted */ unsigned long sub_sent, sub_renew_after; struct ceph_connection *con; + bool have_fsid; /* msg pools */ - struct ceph_msgpool msgpool_mount_ack; struct ceph_msgpool msgpool_subscribe_ack; struct ceph_msgpool msgpool_statfs_reply; + struct ceph_msgpool msgpool_auth_reply; /* pending statfs requests */ struct radix_tree_root statfs_request_tree; int num_statfs_requests; u64 last_tid; - /* mds/osd map or mount requests */ - bool want_mount; + /* mds/osd map */ int want_next_osdmap; /* 1 = want, 2 = want+asked */ u32 have_osdmap, have_mdsmap; @@ -101,11 +105,11 @@ extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have); extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc); -extern int ceph_monc_request_mount(struct ceph_mon_client *monc); - extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf); +extern int ceph_monc_open_session(struct ceph_mon_client *monc); + #endif diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h index 8e3ea2eb1bf..c758e8f8f71 100644 --- a/fs/ceph/msgr.h +++ b/fs/ceph/msgr.h @@ -21,7 +21,7 @@ * whenever the wire protocol changes. try to keep this string length * constant. */ -#define CEPH_BANNER "ceph v023" +#define CEPH_BANNER "ceph v024" #define CEPH_BANNER_MAX_LEN 30 @@ -46,11 +46,16 @@ struct ceph_entity_name { __le64 num; } __attribute__ ((packed)); -#define CEPH_ENTITY_TYPE_MON 1 -#define CEPH_ENTITY_TYPE_MDS 2 -#define CEPH_ENTITY_TYPE_OSD 3 -#define CEPH_ENTITY_TYPE_CLIENT 4 -#define CEPH_ENTITY_TYPE_ADMIN 5 +#define CEPH_ENTITY_TYPE_MON 0x01 +#define CEPH_ENTITY_TYPE_MDS 0x02 +#define CEPH_ENTITY_TYPE_OSD 0x04 +#define CEPH_ENTITY_TYPE_CLIENT 0x08 +#define CEPH_ENTITY_TYPE_ADMIN 0x10 +#define CEPH_ENTITY_TYPE_AUTH 0x20 + +#define CEPH_ENTITY_TYPE_ANY 0xFF + +extern const char *ceph_entity_type_name(int type); /* * entity_addr -- network address @@ -94,6 +99,7 @@ struct ceph_entity_inst { #define CEPH_MSGR_TAG_ACK 8 /* message ack */ #define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */ #define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */ +#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */ /* @@ -104,6 +110,8 @@ struct ceph_msg_connect { __le32 global_seq; /* count connections initiated by this host */ __le32 connect_seq; /* count connections initiated in this session */ __le32 protocol_version; + __le32 authorizer_protocol; + __le32 authorizer_len; __u8 flags; /* CEPH_MSG_CONNECT_* */ } __attribute__ ((packed)); @@ -112,6 +120,7 @@ struct ceph_msg_connect_reply { __le32 global_seq; __le32 connect_seq; __le32 protocol_version; + __le32 authorizer_len; __u8 flags; } __attribute__ ((packed)); diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index 0a16c4f951f..ca0ee68c322 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -11,6 +11,7 @@ #include "osd_client.h" #include "messenger.h" #include "decode.h" +#include "auth.h" const static struct ceph_connection_operations osd_con_ops; @@ -331,6 +332,7 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc) osd->o_con.private = osd; osd->o_con.ops = &osd_con_ops; osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD; + return osd; } @@ -880,9 +882,15 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) /* verify fsid */ ceph_decode_need(&p, end, sizeof(fsid), bad); ceph_decode_copy(&p, &fsid, sizeof(fsid)); - if (ceph_fsid_compare(&fsid, &osdc->client->monc.monmap->fsid)) { - pr_err("got osdmap with wrong fsid, ignoring\n"); - return; + if (osdc->client->monc.have_fsid) { + if (ceph_fsid_compare(&fsid, + &osdc->client->monc.monmap->fsid)) { + pr_err("got osdmap with wrong fsid, ignoring\n"); + return; + } + } else { + ceph_fsid_set(&osdc->client->monc.monmap->fsid, &fsid); + osdc->client->monc.have_fsid = true; } down_write(&osdc->map_sem); @@ -1302,10 +1310,59 @@ static void put_osd_con(struct ceph_connection *con) put_osd(osd); } +/* + * authentication + */ +static int get_authorizer(struct ceph_connection *con, + void **buf, int *len, int *proto, + void **reply_buf, int *reply_len, int force_new) +{ + struct ceph_osd *o = con->private; + struct ceph_osd_client *osdc = o->o_osdc; + struct ceph_auth_client *ac = osdc->client->monc.auth; + int ret = 0; + + if (force_new && o->o_authorizer) { + ac->ops->destroy_authorizer(ac, o->o_authorizer); + o->o_authorizer = NULL; + } + if (o->o_authorizer == NULL) { + ret = ac->ops->create_authorizer( + ac, CEPH_ENTITY_TYPE_OSD, + &o->o_authorizer, + &o->o_authorizer_buf, + &o->o_authorizer_buf_len, + &o->o_authorizer_reply_buf, + &o->o_authorizer_reply_buf_len); + if (ret) + return ret; + } + + *proto = ac->protocol; + *buf = o->o_authorizer_buf; + *len = o->o_authorizer_buf_len; + *reply_buf = o->o_authorizer_reply_buf; + *reply_len = o->o_authorizer_reply_buf_len; + return 0; +} + + +static int verify_authorizer_reply(struct ceph_connection *con, int len) +{ + struct ceph_osd *o = con->private; + struct ceph_osd_client *osdc = o->o_osdc; + struct ceph_auth_client *ac = osdc->client->monc.auth; + + return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len); +} + + const static struct ceph_connection_operations osd_con_ops = { .get = get_osd_con, .put = put_osd_con, .dispatch = dispatch, + .get_authorizer = get_authorizer, + .verify_authorizer_reply = verify_authorizer_reply, .alloc_msg = alloc_msg, .fault = osd_reset, .alloc_middle = ceph_alloc_middle, diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h index 766c8dc80af..3d4ae6595aa 100644 --- a/fs/ceph/osd_client.h +++ b/fs/ceph/osd_client.h @@ -13,6 +13,7 @@ struct ceph_msg; struct ceph_snap_context; struct ceph_osd_request; struct ceph_osd_client; +struct ceph_authorizer; /* * completion callback for async writepages @@ -29,6 +30,9 @@ struct ceph_osd { struct rb_node o_node; struct ceph_connection o_con; struct list_head o_requests; + struct ceph_authorizer *o_authorizer; + void *o_authorizer_buf, *o_authorizer_reply_buf; + size_t o_authorizer_buf_len, o_authorizer_reply_buf_len; }; /* an in-flight request */ diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h index fb23ff9297c..12bfb2f7c27 100644 --- a/fs/ceph/rados.h +++ b/fs/ceph/rados.h @@ -157,7 +157,6 @@ struct ceph_eversion { #define CEPH_OSD_OP_MODE_WR 0x2000 #define CEPH_OSD_OP_MODE_RMW 0x3000 #define CEPH_OSD_OP_MODE_SUB 0x4000 -#define CEPH_OSD_OP_MODE_EXEC 0x8000 #define CEPH_OSD_OP_TYPE 0x0f00 #define CEPH_OSD_OP_TYPE_LOCK 0x0100 @@ -285,6 +284,7 @@ enum { CEPH_OSD_FLAG_BALANCE_READS = 256, CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */ CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */ + CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */ }; enum { diff --git a/fs/ceph/super.c b/fs/ceph/super.c index fe0a5962a08..c901395ae8a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -128,6 +128,8 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) seq_puts(m, ",noasyncreaddir"); if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) seq_printf(m, ",snapdirname=%s", args->snapdir_name); + if (args->name) + seq_printf(m, ",name=%s", args->name); if (args->secret) seq_puts(m, ",secret="); return 0; @@ -224,12 +226,12 @@ const char *ceph_msg_type_name(int type) switch (type) { case CEPH_MSG_SHUTDOWN: return "shutdown"; case CEPH_MSG_PING: return "ping"; + case CEPH_MSG_AUTH: return "auth"; + case CEPH_MSG_AUTH_REPLY: return "auth_reply"; case CEPH_MSG_MON_MAP: return "mon_map"; case CEPH_MSG_MON_GET_MAP: return "mon_get_map"; case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe"; case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack"; - case CEPH_MSG_CLIENT_MOUNT: return "client_mount"; - case CEPH_MSG_CLIENT_MOUNT_ACK: return "client_mount_ack"; case CEPH_MSG_STATFS: return "statfs"; case CEPH_MSG_STATFS_REPLY: return "statfs_reply"; case CEPH_MSG_MDS_MAP: return "mds_map"; @@ -267,6 +269,7 @@ enum { Opt_last_int, /* int args above */ Opt_snapdirname, + Opt_name, Opt_secret, Opt_last_string, /* string args above */ @@ -293,6 +296,7 @@ static match_table_t arg_tokens = { {Opt_readdir_max_entries, "readdir_max_entries=%d"}, /* int args above */ {Opt_snapdirname, "snapdirname=%s"}, + {Opt_name, "name=%s"}, {Opt_secret, "secret=%s"}, /* string args above */ {Opt_ip, "ip=%s"}, @@ -407,6 +411,11 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options, argstr[0].to-argstr[0].from, GFP_KERNEL); break; + case Opt_name: + args->name = kstrndup(argstr[0].from, + argstr[0].to-argstr[0].from, + GFP_KERNEL); + break; case Opt_secret: args->secret = kstrndup(argstr[0].from, argstr[0].to-argstr[0].from, @@ -476,6 +485,8 @@ static void destroy_mount_args(struct ceph_mount_args *args) dout("destroy_mount_args %p\n", args); kfree(args->snapdir_name); args->snapdir_name = NULL; + kfree(args->name); + args->name = NULL; kfree(args->secret); args->secret = NULL; kfree(args); @@ -657,27 +668,23 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, client->msgr->nocrc = ceph_test_opt(client, NOCRC); } - /* send mount request, and wait for mon, mds, and osd maps */ - err = ceph_monc_request_mount(&client->monc); + /* open session, and wait for mon, mds, and osd maps */ + err = ceph_monc_open_session(&client->monc); if (err < 0) goto out; - while (!have_mon_map(client) && !client->mount_err) { + while (!have_mon_map(client)) { err = -EIO; if (timeout && time_after_eq(jiffies, started + timeout)) goto out; /* wait */ - dout("mount waiting for mount\n"); - err = wait_event_interruptible_timeout(client->mount_wq, - client->mount_err || have_mon_map(client), + dout("mount waiting for mon_map\n"); + err = wait_event_interruptible_timeout(client->mount_wq, /* FIXME */ + have_mon_map(client), timeout); if (err == -EINTR || err == -ERESTARTSYS) goto out; - if (client->mount_err) { - err = client->mount_err; - goto out; - } } dout("mount opening root\n"); @@ -795,7 +802,6 @@ static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) client->backing_dev_info.ra_pages = (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; - err = bdi_register_dev(&client->backing_dev_info, sb->s_dev); return err; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 8aa1ffba6c0..e0e8130959b 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -61,6 +61,7 @@ struct ceph_mount_args { int max_readdir; /* max readdir size */ int osd_timeout; char *snapdir_name; /* default ".snap" */ + char *name; char *secret; int cap_release_safety; }; @@ -75,6 +76,7 @@ struct ceph_mount_args { #define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) #define CEPH_SNAPDIRNAME_DEFAULT ".snap" +#define CEPH_AUTH_NAME_DEFAULT "guest" /* * Delay telling the MDS we no longer want caps, in case we reopen -- cgit v1.2.3 From 03c677e1d189ff62891d9f278c55bb798a418b81 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 20 Nov 2009 15:14:15 -0800 Subject: ceph: reset msgr backoff during open, not after successful handshake Reset the backoff delay when we reopen the connection, so that the delays for any initial connection problems are reasonable. We were resetting only after a successful handshake, which was of limited utility. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 0b1674854b2..45cec31fdf5 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -348,6 +348,7 @@ void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr) set_bit(OPENING, &con->state); clear_bit(CLOSED, &con->state); memcpy(&con->peer_addr, addr, sizeof(*addr)); + con->delay = 0; /* reset backoff memory */ queue_con(con); } @@ -1162,8 +1163,6 @@ static int process_connect(struct ceph_connection *con) con->connect_seq); WARN_ON(con->connect_seq != le32_to_cpu(con->in_reply.connect_seq)); - - con->delay = 0; /* reset backoff memory */ prepare_read_tag(con); break; -- cgit v1.2.3 From b6c1d5b81ea0841ae9d3ce2cda319ab986b081cf Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 7 Dec 2009 12:17:17 -0800 Subject: ceph: simplify ceph_buffer interface We never allocate the ceph_buffer and buffer separtely, so use a single constructor. Disallow put on NULL buffer; make the caller check. Signed-off-by: Sage Weil --- fs/ceph/buffer.c | 23 +++++++++++++++++++---- fs/ceph/buffer.h | 20 +++----------------- fs/ceph/inode.c | 11 +++++++---- fs/ceph/messenger.c | 2 +- fs/ceph/xattr.c | 8 +++++--- 5 files changed, 35 insertions(+), 29 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c index 847c5da9a0d..2576bd452cb 100644 --- a/fs/ceph/buffer.c +++ b/fs/ceph/buffer.c @@ -2,23 +2,38 @@ #include "ceph_debug.h" #include "buffer.h" -struct ceph_buffer *ceph_buffer_new(gfp_t gfp) +struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) { struct ceph_buffer *b; b = kmalloc(sizeof(*b), gfp); if (!b) return NULL; + + b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN); + if (b->vec.iov_base) { + b->is_vmalloc = false; + } else { + b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL); + if (!b->vec.iov_base) { + kfree(b); + return NULL; + } + b->is_vmalloc = true; + } + kref_init(&b->kref); - b->vec.iov_base = NULL; - b->vec.iov_len = 0; - b->alloc_len = 0; + b->alloc_len = len; + b->vec.iov_len = len; + dout("buffer_new %p\n", b); return b; } void ceph_buffer_release(struct kref *kref) { struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref); + + dout("buffer_release %p\n", b); if (b->vec.iov_base) { if (b->is_vmalloc) vfree(b->vec.iov_base); diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h index 3f541a13094..47b9514c5bb 100644 --- a/fs/ceph/buffer.h +++ b/fs/ceph/buffer.h @@ -20,8 +20,8 @@ struct ceph_buffer { bool is_vmalloc; }; -struct ceph_buffer *ceph_buffer_new(gfp_t gfp); -int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp); +extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp); +extern void ceph_buffer_release(struct kref *kref); static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b) { @@ -29,23 +29,9 @@ static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b) return b; } -void ceph_buffer_release(struct kref *kref); - static inline void ceph_buffer_put(struct ceph_buffer *b) { - if (b) - kref_put(&b->kref, ceph_buffer_release); -} - -static inline struct ceph_buffer *ceph_buffer_new_alloc(int len, gfp_t gfp) -{ - struct ceph_buffer *b = ceph_buffer_new(gfp); - - if (b && ceph_buffer_alloc(b, len, gfp) < 0) { - ceph_buffer_put(b); - b = NULL; - } - return b; + kref_put(&b->kref, ceph_buffer_release); } #endif diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 074ee42bd34..db684686f48 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -383,8 +383,10 @@ void ceph_destroy_inode(struct inode *inode) } __ceph_destroy_xattrs(ci); - ceph_buffer_put(ci->i_xattrs.blob); - ceph_buffer_put(ci->i_xattrs.prealloc_blob); + if (ci->i_xattrs.blob) + ceph_buffer_put(ci->i_xattrs.blob); + if (ci->i_xattrs.prealloc_blob) + ceph_buffer_put(ci->i_xattrs.prealloc_blob); kmem_cache_free(ceph_inode_cachep, ci); } @@ -526,7 +528,7 @@ static int fill_inode(struct inode *inode, * bytes are the xattr count). */ if (iinfo->xattr_len > 4) { - xattr_blob = ceph_buffer_new_alloc(iinfo->xattr_len, GFP_NOFS); + xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS); if (!xattr_blob) pr_err("fill_inode ENOMEM xattr blob %d bytes\n", iinfo->xattr_len); @@ -715,7 +717,8 @@ no_change: err = 0; out: - ceph_buffer_put(xattr_blob); + if (xattr_blob) + ceph_buffer_put(xattr_blob); return err; } diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 45cec31fdf5..bf762107b3d 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -2047,7 +2047,7 @@ int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) BUG_ON(!middle_len); BUG_ON(msg->middle); - msg->middle = ceph_buffer_new_alloc(middle_len, GFP_NOFS); + msg->middle = ceph_buffer_new(middle_len, GFP_NOFS); if (!msg->middle) return -ENOMEM; return 0; diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 04769a3ab83..37d6ce64569 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -482,7 +482,8 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) ci->i_xattrs.prealloc_blob->vec.iov_len = dest - ci->i_xattrs.prealloc_blob->vec.iov_base; - ceph_buffer_put(ci->i_xattrs.blob); + if (ci->i_xattrs.blob) + ceph_buffer_put(ci->i_xattrs.blob); ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob; ci->i_xattrs.prealloc_blob = NULL; ci->i_xattrs.dirty = false; @@ -745,11 +746,12 @@ retry: spin_unlock(&inode->i_lock); dout(" preaallocating new blob size=%d\n", required_blob_size); - blob = ceph_buffer_new_alloc(required_blob_size, GFP_NOFS); + blob = ceph_buffer_new(required_blob_size, GFP_NOFS); if (!blob) goto out; spin_lock(&inode->i_lock); - ceph_buffer_put(ci->i_xattrs.prealloc_blob); + if (ci->i_xattrs.prealloc_blob) + ceph_buffer_put(ci->i_xattrs.prealloc_blob); ci->i_xattrs.prealloc_blob = blob; goto retry; } -- cgit v1.2.3 From c2e552e76e2c6907ca50cd9a4b747a2e2e8c615e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 7 Dec 2009 15:55:05 -0800 Subject: ceph: use kref for ceph_msg Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 47 ++++++++++++++++++----------------------------- fs/ceph/messenger.h | 13 ++++++++----- fs/ceph/msgpool.c | 2 +- 3 files changed, 27 insertions(+), 35 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index bf762107b3d..b0571b01b19 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -1958,7 +1958,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, m = kmalloc(sizeof(*m), GFP_NOFS); if (m == NULL) goto out; - atomic_set(&m->nref, 1); + kref_init(&m->kref); INIT_LIST_HEAD(&m->list_head); m->hdr.type = cpu_to_le16(type); @@ -2070,34 +2070,23 @@ void ceph_msg_kfree(struct ceph_msg *m) /* * Drop a msg ref. Destroy as needed. */ -void ceph_msg_put(struct ceph_msg *m) -{ - dout("ceph_msg_put %p %d -> %d\n", m, atomic_read(&m->nref), - atomic_read(&m->nref)-1); - if (atomic_read(&m->nref) <= 0) { - pr_err("bad ceph_msg_put on %p %llu %d=%s %d+%d\n", - m, le64_to_cpu(m->hdr.seq), - le16_to_cpu(m->hdr.type), - ceph_msg_type_name(le16_to_cpu(m->hdr.type)), - le32_to_cpu(m->hdr.front_len), - le32_to_cpu(m->hdr.data_len)); - WARN_ON(1); - } - if (atomic_dec_and_test(&m->nref)) { - dout("ceph_msg_put last one on %p\n", m); - WARN_ON(!list_empty(&m->list_head)); - - /* drop middle, data, if any */ - if (m->middle) { - ceph_buffer_put(m->middle); - m->middle = NULL; - } - m->nr_pages = 0; - m->pages = NULL; +void ceph_msg_last_put(struct kref *kref) +{ + struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); - if (m->pool) - ceph_msgpool_put(m->pool, m); - else - ceph_msg_kfree(m); + dout("ceph_msg_put last one on %p\n", m); + WARN_ON(!list_empty(&m->list_head)); + + /* drop middle, data, if any */ + if (m->middle) { + ceph_buffer_put(m->middle); + m->middle = NULL; } + m->nr_pages = 0; + m->pages = NULL; + + if (m->pool) + ceph_msgpool_put(m->pool, m); + else + ceph_msg_kfree(m); } diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h index f9c9f648730..981b7c08ad8 100644 --- a/fs/ceph/messenger.h +++ b/fs/ceph/messenger.h @@ -1,6 +1,7 @@ #ifndef __FS_CEPH_MESSENGER_H #define __FS_CEPH_MESSENGER_H +#include #include #include #include @@ -85,7 +86,7 @@ struct ceph_msg { struct page **pages; /* data payload. NOT OWNER. */ unsigned nr_pages; /* size of page array */ struct list_head list_head; - atomic_t nref; + struct kref kref; bool front_is_vmalloc; bool more_to_follow; int front_max; @@ -243,11 +244,13 @@ extern int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg); static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg) { - dout("ceph_msg_get %p %d -> %d\n", msg, atomic_read(&msg->nref), - atomic_read(&msg->nref)+1); - atomic_inc(&msg->nref); + kref_get(&msg->kref); return msg; } -extern void ceph_msg_put(struct ceph_msg *msg); +extern void ceph_msg_last_put(struct kref *kref); +static inline void ceph_msg_put(struct ceph_msg *msg) +{ + kref_put(&msg->kref, ceph_msg_last_put); +} #endif diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c index 7599b338207..ad5482c0267 100644 --- a/fs/ceph/msgpool.c +++ b/fs/ceph/msgpool.c @@ -165,7 +165,7 @@ void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg) { spin_lock(&pool->lock); if (pool->num < pool->min) { - ceph_msg_get(msg); /* retake a single ref */ + kref_set(&msg->kref, 1); /* retake a single ref */ list_add(&msg->list_head, &pool->msgs); pool->num++; dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg, -- cgit v1.2.3 From c86a2930ccbd90d77c54d04b5c2bbec95b989e40 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 14 Dec 2009 14:04:30 -0800 Subject: ceph: carry explicit msg reference for currently sending message Carry a ceph_msg reference for connection->out_msg. This will allow us to make out_sent optional. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 22 ++++++++++++++++++---- fs/ceph/messenger.h | 1 + 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index b0571b01b19..96fd556a780 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -322,7 +322,10 @@ static void reset_connection(struct ceph_connection *con) con->connect_seq = 0; con->out_seq = 0; - con->out_msg = NULL; + if (con->out_msg) { + ceph_msg_put(con->out_msg); + con->out_msg = NULL; + } con->in_seq = 0; mutex_unlock(&con->out_mutex); } @@ -423,7 +426,7 @@ static void prepare_write_message_footer(struct ceph_connection *con, int v) con->out_kvec_bytes += sizeof(m->footer); con->out_kvec_left++; con->out_more = m->more_to_follow; - con->out_msg = NULL; /* we're done with this one */ + con->out_msg_done = true; } /* @@ -436,6 +439,7 @@ static void prepare_write_message(struct ceph_connection *con) con->out_kvec_bytes = 0; con->out_kvec_is_msg = true; + con->out_msg_done = false; /* Sneak an ack in there first? If we can get it into the same * TCP packet that's a good thing. */ @@ -452,8 +456,9 @@ static void prepare_write_message(struct ceph_connection *con) /* move message to sending/sent list */ m = list_first_entry(&con->out_queue, struct ceph_msg, list_head); + con->out_msg = m; + ceph_msg_get(m); list_move_tail(&m->list_head, &con->out_sent); - con->out_msg = m; /* we don't bother taking a reference here. */ m->hdr.seq = cpu_to_le64(++con->out_seq); @@ -1521,6 +1526,12 @@ more_kvec: /* msg pages? */ if (con->out_msg) { + if (con->out_msg_done) { + ceph_msg_put(con->out_msg); + con->out_msg = NULL; /* we're done with this one */ + goto do_next; + } + ret = write_partial_msg_pages(con); if (ret == 1) goto more_kvec; /* we need to send the footer, too! */ @@ -1533,6 +1544,7 @@ more_kvec: } } +do_next: if (!test_bit(CONNECTING, &con->state)) { /* is anything else pending? */ if (!list_empty(&con->out_queue)) { @@ -1923,8 +1935,10 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) list_del_init(&msg->list_head); ceph_msg_put(msg); msg->hdr.seq = 0; - if (con->out_msg == msg) + if (con->out_msg == msg) { + ceph_msg_put(con->out_msg); con->out_msg = NULL; + } if (con->out_kvec_is_msg) { con->out_skip = con->out_kvec_bytes; con->out_kvec_is_msg = false; diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h index 981b7c08ad8..eff5cb5197f 100644 --- a/fs/ceph/messenger.h +++ b/fs/ceph/messenger.h @@ -182,6 +182,7 @@ struct ceph_connection { /* message out temps */ struct ceph_msg *out_msg; /* sending message (== tail of out_sent) */ + bool out_msg_done; struct ceph_msg_pos out_msg_pos; struct kvec out_kvec[8], /* sending header/footer data */ -- cgit v1.2.3 From 5e095e8b40b0402ad3bcadc5b8d84c38b26c30b2 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 14 Dec 2009 14:30:34 -0800 Subject: ceph: plug msg leak in con_fault Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 96fd556a780..98519bd33f0 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -1434,8 +1434,9 @@ no_data: */ static void process_message(struct ceph_connection *con) { - struct ceph_msg *msg = con->in_msg; + struct ceph_msg *msg; + msg = con->in_msg; con->in_msg = NULL; /* if first message, set peer_name */ @@ -1810,7 +1811,11 @@ static void ceph_fault(struct ceph_connection *con) clear_bit(BUSY, &con->state); /* to avoid an improbable race */ con_close_socket(con); - con->in_msg = NULL; + + if (con->in_msg) { + ceph_msg_put(con->in_msg); + con->in_msg = NULL; + } /* If there are no messages in the queue, place the connection * in a STANDBY state (i.e., don't try to reconnect just yet). */ -- cgit v1.2.3 From 92ac41d0a4ab26fb68d3f841332e5d1f15d79123 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 14 Dec 2009 14:56:56 -0800 Subject: ceph: detect lossy state of connection The server indicates whether a connection is lossy; set our LOSSYTX bit appropriately. Do not set lossy bit on outgoing connections. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 98519bd33f0..986d8fb9c57 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -625,8 +625,6 @@ static void prepare_write_connect(struct ceph_messenger *msgr, con->out_connect.global_seq = cpu_to_le32(global_seq); con->out_connect.protocol_version = cpu_to_le32(proto); con->out_connect.flags = 0; - if (test_bit(LOSSYTX, &con->state)) - con->out_connect.flags = CEPH_MSG_CONNECT_LOSSY; if (!after_banner) { con->out_kvec_left = 0; @@ -1168,6 +1166,10 @@ static int process_connect(struct ceph_connection *con) con->connect_seq); WARN_ON(con->connect_seq != le32_to_cpu(con->in_reply.connect_seq)); + + if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) + set_bit(LOSSYTX, &con->state); + prepare_read_tag(con); break; -- cgit v1.2.3 From b3d1dbbdd5670d8a9fb01f7dfb1cac522ff6795a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 14 Dec 2009 14:58:11 -0800 Subject: ceph: don't save sent messages on lossy connections For lossy connections we drop all state on socket errors, so there is no reason to keep sent ceph_msg's around. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 986d8fb9c57..d5eef76a253 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -453,12 +453,16 @@ static void prepare_write_message(struct ceph_connection *con) con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack); } - /* move message to sending/sent list */ m = list_first_entry(&con->out_queue, struct ceph_msg, list_head); con->out_msg = m; - ceph_msg_get(m); - list_move_tail(&m->list_head, &con->out_sent); + if (test_bit(LOSSYTX, &con->state)) { + /* put message on sent list */ + ceph_msg_get(m); + list_move_tail(&m->list_head, &con->out_sent); + } else { + list_del_init(&m->list_head); + } m->hdr.seq = cpu_to_le64(++con->out_seq); -- cgit v1.2.3 From 9ec7cab14e6de732d4e7c355fe67c5810c32c758 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 14 Dec 2009 15:13:47 -0800 Subject: ceph: hex dump corrupt server data to KERN_DEBUG Also, print fsid using standard format, NOT hex dump. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 1 + fs/ceph/mds_client.c | 4 ++++ fs/ceph/mdsmap.c | 4 ++++ fs/ceph/messenger.c | 20 ++++++++++++++++++++ fs/ceph/messenger.h | 2 ++ fs/ceph/mon_client.c | 2 ++ fs/ceph/osd_client.c | 2 ++ fs/ceph/osdmap.c | 3 +++ fs/ceph/snap.c | 1 + fs/ceph/super.c | 9 ++------- 10 files changed, 41 insertions(+), 7 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 9b9ce143ac1..dfb509f5354 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2681,6 +2681,7 @@ done: bad: pr_err("ceph_handle_caps: corrupt message\n"); + ceph_msg_dump(msg); return; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 739093f281d..29a93fe35f8 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1650,6 +1650,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) return; if (msg->front.iov_len < sizeof(*head)) { pr_err("mdsc_handle_reply got corrupt (short) reply\n"); + ceph_msg_dump(msg); return; } @@ -1740,6 +1741,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_lock(&session->s_mutex); if (err < 0) { pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds); + ceph_msg_dump(msg); goto out_err; } @@ -1929,6 +1931,7 @@ static void handle_session(struct ceph_mds_session *session, bad: pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds, (int)msg->front.iov_len); + ceph_msg_dump(msg); return; } @@ -2394,6 +2397,7 @@ out: bad: pr_err("corrupt lease message\n"); + ceph_msg_dump(msg); } void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index cad8d25861e..c4c498e6dfe 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -49,6 +49,7 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) { struct ceph_mdsmap *m; + const void *start = *p; int i, j, n; int err = -EINVAL; u16 version; @@ -154,6 +155,9 @@ badmem: err = -ENOMEM; bad: pr_err("corrupt mdsmap\n"); + print_hex_dump(KERN_DEBUG, "mdsmap: ", + DUMP_PREFIX_OFFSET, 16, 1, + start, end - start, true); ceph_mdsmap_destroy(m); return ERR_PTR(-EINVAL); } diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index d5eef76a253..b10f88c5670 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -2115,3 +2115,23 @@ void ceph_msg_last_put(struct kref *kref) else ceph_msg_kfree(m); } + +void ceph_msg_dump(struct ceph_msg *msg) +{ + pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg, + msg->front_max, msg->nr_pages); + print_hex_dump(KERN_DEBUG, "header: ", + DUMP_PREFIX_OFFSET, 16, 1, + &msg->hdr, sizeof(msg->hdr), true); + print_hex_dump(KERN_DEBUG, " front: ", + DUMP_PREFIX_OFFSET, 16, 1, + msg->front.iov_base, msg->front.iov_len, true); + if (msg->middle) + print_hex_dump(KERN_DEBUG, "middle: ", + DUMP_PREFIX_OFFSET, 16, 1, + msg->middle->vec.iov_base, + msg->middle->vec.iov_len, true); + print_hex_dump(KERN_DEBUG, "footer: ", + DUMP_PREFIX_OFFSET, 16, 1, + &msg->footer, sizeof(msg->footer), true); +} diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h index eff5cb5197f..e04c214b4f6 100644 --- a/fs/ceph/messenger.h +++ b/fs/ceph/messenger.h @@ -254,4 +254,6 @@ static inline void ceph_msg_put(struct ceph_msg *msg) kref_put(&msg->kref, ceph_msg_last_put); } +extern void ceph_msg_dump(struct ceph_msg *msg); + #endif diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c index a76da5e6dbd..775a9c029c5 100644 --- a/fs/ceph/mon_client.c +++ b/fs/ceph/mon_client.c @@ -242,6 +242,7 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc, return; bad: pr_err("got corrupt subscribe-ack msg\n"); + ceph_msg_dump(msg); } /* @@ -364,6 +365,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc, bad: pr_err("corrupt statfs reply, no tid\n"); + ceph_msg_dump(msg); } /* diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index 63482ef3de0..4bfe880d53c 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -773,6 +773,7 @@ bad: pr_err("corrupt osd_op_reply got %d %d expected %d\n", (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len), (int)sizeof(*rhead)); + ceph_msg_dump(msg); } @@ -964,6 +965,7 @@ done: bad: pr_err("osdc handle_map corrupt msg\n"); + ceph_msg_dump(msg); up_write(&osdc->map_sem); return; } diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index be5318aa771..8c8ffe5ef7d 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -726,6 +726,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bad: pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n", epoch, (int)(*p - start), *p, start, end); + print_hex_dump(KERN_DEBUG, "osdmap: ", + DUMP_PREFIX_OFFSET, 16, 1, + start, end - start, true); if (newcrush) crush_destroy(newcrush); return ERR_PTR(err); diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 52f46a1208f..dcf18d92130 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -877,6 +877,7 @@ split_skip_inode: bad: pr_err("corrupt snap message from mds%d\n", mds); + ceph_msg_dump(msg); out: if (locked_rwsem) up_write(&mdsc->snap_rwsem); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index a828943296c..6d02a166f8f 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -602,13 +602,8 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) { if (client->have_fsid) { if (ceph_fsid_compare(&client->fsid, fsid)) { - print_hex_dump(KERN_ERR, "this fsid: ", - DUMP_PREFIX_NONE, 16, 1, - (void *)fsid, 16, 0); - print_hex_dump(KERN_ERR, " old fsid: ", - DUMP_PREFIX_NONE, 16, 1, - (void *)&client->fsid, 16, 0); - pr_err("fsid mismatch\n"); + pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT, + PR_FSID(&client->fsid), PR_FSID(fsid)); return -1; } } else { -- cgit v1.2.3 From cf3e5c409b5d66ec66207092a3f7e3e2c42c0f3f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 11 Dec 2009 09:48:05 -0800 Subject: ceph: plug leak of incoming message during connection fault/close If we explicitly close a connection, or there is a socket error, we need to drop any partially received message. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index b10f88c5670..b12604ef184 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -320,6 +320,11 @@ static void reset_connection(struct ceph_connection *con) ceph_msg_remove_list(&con->out_queue); ceph_msg_remove_list(&con->out_sent); + if (con->in_msg) { + ceph_msg_put(con->in_msg); + con->in_msg = NULL; + } + con->connect_seq = 0; con->out_seq = 0; if (con->out_msg) { @@ -1288,7 +1293,7 @@ static int read_partial_message(struct ceph_connection *con) con->in_msg = con->ops->alloc_msg(con, &con->in_hdr); if (!con->in_msg) { /* skip this message */ - dout("alloc_msg returned NULL, skipping message\n"); + pr_err("alloc_msg returned NULL, skipping message\n"); con->in_base_pos = -front_len - middle_len - data_len - sizeof(m->footer); con->in_tag = CEPH_MSGR_TAG_READY; @@ -1327,7 +1332,7 @@ static int read_partial_message(struct ceph_connection *con) if (con->ops->alloc_middle) ret = con->ops->alloc_middle(con, m); if (ret < 0) { - dout("alloc_middle failed, skipping payload\n"); + pr_err("alloc_middle fail skipping payload\n"); con->in_base_pos = -middle_len - data_len - sizeof(m->footer); ceph_msg_put(con->in_msg); @@ -1498,6 +1503,7 @@ more: set_bit(CONNECTING, &con->state); clear_bit(NEGOTIATING, &con->state); + BUG_ON(con->in_msg); con->in_tag = CEPH_MSGR_TAG_READY; dout("try_write initiating connect on %p new state %lu\n", con, con->state); -- cgit v1.2.3 From 169e16ce816ca417286daf1db25de424a9d65a0c Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Wed, 16 Dec 2009 14:22:17 -0800 Subject: ceph: remove unaccessible code Signed-off-by: Yehuda Sadeh --- fs/ceph/messenger.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index b12604ef184..2e4e9773c46 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -1531,10 +1531,6 @@ more_kvec: ret = write_partial_kvec(con); if (ret <= 0) goto done; - if (ret < 0) { - dout("try_write write_partial_kvec err %d\n", ret); - goto done; - } } /* msg pages? */ -- cgit v1.2.3 From ec302645f4a9bd9ec757c30d185557e1c0972c1a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 22 Dec 2009 10:43:42 -0800 Subject: ceph: use connection mutex to protect read and write stages Use a single mutex (previously out_mutex) to protect both read and write activity from concurrent ceph_con_* calls. Drop the mutex when doing callbacks to avoid nested locking (the callback may need to call something like ceph_con_close). Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 49 +++++++++++++++++++++++++++++++------------------ fs/ceph/messenger.h | 3 ++- 2 files changed, 33 insertions(+), 19 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 2e4e9773c46..c03b4185c14 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -316,7 +316,6 @@ static void reset_connection(struct ceph_connection *con) { /* reset connection, out_queue, msg_ and connect_seq */ /* discard existing out_queue and msg_seq */ - mutex_lock(&con->out_mutex); ceph_msg_remove_list(&con->out_queue); ceph_msg_remove_list(&con->out_sent); @@ -332,7 +331,6 @@ static void reset_connection(struct ceph_connection *con) con->out_msg = NULL; } con->in_seq = 0; - mutex_unlock(&con->out_mutex); } /* @@ -343,7 +341,9 @@ void ceph_con_close(struct ceph_connection *con) dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr)); set_bit(CLOSED, &con->state); /* in case there's queued work */ clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */ + mutex_lock(&con->mutex); reset_connection(con); + mutex_unlock(&con->mutex); queue_con(con); } @@ -392,7 +392,7 @@ void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con) memset(con, 0, sizeof(*con)); atomic_set(&con->nref, 1); con->msgr = msgr; - mutex_init(&con->out_mutex); + mutex_init(&con->mutex); INIT_LIST_HEAD(&con->out_queue); INIT_LIST_HEAD(&con->out_sent); INIT_DELAYED_WORK(&con->work, con_work); @@ -571,11 +571,13 @@ static void prepare_connect_authorizer(struct ceph_connection *con) int auth_len = 0; int auth_protocol = 0; + mutex_unlock(&con->mutex); if (con->ops->get_authorizer) con->ops->get_authorizer(con, &auth_buf, &auth_len, &auth_protocol, &con->auth_reply_buf, &con->auth_reply_buf_len, con->auth_retry); + mutex_lock(&con->mutex); con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol); con->out_connect.authorizer_len = cpu_to_le32(auth_len); @@ -1094,10 +1096,13 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->out_connect.protocol_version), le32_to_cpu(con->in_reply.protocol_version)); con->error_msg = "protocol version mismatch"; - if (con->ops->bad_proto) - con->ops->bad_proto(con); reset_connection(con); set_bit(CLOSED, &con->state); /* in case there's queued work */ + + mutex_unlock(&con->mutex); + if (con->ops->bad_proto) + con->ops->bad_proto(con); + mutex_lock(&con->mutex); return -1; case CEPH_MSGR_TAG_BADAUTHORIZER: @@ -1133,9 +1138,11 @@ static int process_connect(struct ceph_connection *con) prepare_read_connect(con); /* Tell ceph about it. */ + mutex_unlock(&con->mutex); pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name)); if (con->ops->peer_reset) con->ops->peer_reset(con); + mutex_lock(&con->mutex); break; case CEPH_MSGR_TAG_RETRY_SESSION: @@ -1221,7 +1228,6 @@ static void process_ack(struct ceph_connection *con) u64 ack = le64_to_cpu(con->in_temp_ack); u64 seq; - mutex_lock(&con->out_mutex); while (!list_empty(&con->out_sent)) { m = list_first_entry(&con->out_sent, struct ceph_msg, list_head); @@ -1232,7 +1238,6 @@ static void process_ack(struct ceph_connection *con) le16_to_cpu(m->hdr.type), m); ceph_msg_remove(m); } - mutex_unlock(&con->out_mutex); prepare_read_tag(con); } @@ -1366,8 +1371,10 @@ static int read_partial_message(struct ceph_connection *con) /* find pages for data payload */ want = calc_pages_for(data_off & ~PAGE_MASK, data_len); ret = -1; + mutex_unlock(&con->mutex); if (con->ops->prepare_pages) ret = con->ops->prepare_pages(con, m, want); + mutex_lock(&con->mutex); if (ret < 0) { dout("%p prepare_pages failed, skipping payload\n", m); con->in_base_pos = -data_len - sizeof(m->footer); @@ -1454,9 +1461,8 @@ static void process_message(struct ceph_connection *con) if (con->peer_name.type == 0) con->peer_name = msg->hdr.src.name; - mutex_lock(&con->out_mutex); con->in_seq++; - mutex_unlock(&con->out_mutex); + mutex_unlock(&con->mutex); dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", msg, le64_to_cpu(msg->hdr.seq), @@ -1467,6 +1473,8 @@ static void process_message(struct ceph_connection *con) le32_to_cpu(msg->hdr.data_len), con->in_front_crc, con->in_middle_crc, con->in_data_crc); con->ops->dispatch(con, msg); + + mutex_lock(&con->mutex); prepare_read_tag(con); } @@ -1483,7 +1491,7 @@ static int try_write(struct ceph_connection *con) dout("try_write start %p state %lu nref %d\n", con, con->state, atomic_read(&con->nref)); - mutex_lock(&con->out_mutex); + mutex_lock(&con->mutex); more: dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); @@ -1576,7 +1584,7 @@ do_next: done: ret = 0; out: - mutex_unlock(&con->out_mutex); + mutex_unlock(&con->mutex); dout("try_write done on %p\n", con); return ret; } @@ -1600,6 +1608,8 @@ static int try_read(struct ceph_connection *con) dout("try_read start on %p\n", con); msgr = con->msgr; + mutex_lock(&con->mutex); + more: dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, con->in_base_pos); @@ -1693,6 +1703,7 @@ more: done: ret = 0; out: + mutex_unlock(&con->mutex); dout("try_read done on %p\n", con); return ret; @@ -1818,6 +1829,8 @@ static void ceph_fault(struct ceph_connection *con) clear_bit(BUSY, &con->state); /* to avoid an improbable race */ + mutex_lock(&con->mutex); + con_close_socket(con); if (con->in_msg) { @@ -1827,24 +1840,24 @@ static void ceph_fault(struct ceph_connection *con) /* If there are no messages in the queue, place the connection * in a STANDBY state (i.e., don't try to reconnect just yet). */ - mutex_lock(&con->out_mutex); if (list_empty(&con->out_queue) && !con->out_keepalive_pending) { dout("fault setting STANDBY\n"); set_bit(STANDBY, &con->state); - mutex_unlock(&con->out_mutex); + mutex_unlock(&con->mutex); goto out; } /* Requeue anything that hasn't been acked, and retry after a * delay. */ list_splice_init(&con->out_sent, &con->out_queue); - mutex_unlock(&con->out_mutex); if (con->delay == 0) con->delay = BASE_DELAY_INTERVAL; else if (con->delay < MAX_DELAY_INTERVAL) con->delay *= 2; + mutex_unlock(&con->mutex); + /* explicitly schedule work to try to reconnect again later. */ dout("fault queueing %p delay %lu\n", con, con->delay); con->ops->get(con); @@ -1920,7 +1933,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) msg->hdr.dst_erank = con->peer_addr.erank; /* queue */ - mutex_lock(&con->out_mutex); + mutex_lock(&con->mutex); BUG_ON(!list_empty(&msg->list_head)); list_add_tail(&msg->list_head, &con->out_queue); dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg, @@ -1929,7 +1942,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) le32_to_cpu(msg->hdr.front_len), le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len)); - mutex_unlock(&con->out_mutex); + mutex_unlock(&con->mutex); /* if there wasn't anything waiting to send before, queue * new work */ @@ -1942,7 +1955,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) */ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) { - mutex_lock(&con->out_mutex); + mutex_lock(&con->mutex); if (!list_empty(&msg->list_head)) { dout("con_revoke %p msg %p\n", con, msg); list_del_init(&msg->list_head); @@ -1959,7 +1972,7 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) } else { dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg); } - mutex_unlock(&con->out_mutex); + mutex_unlock(&con->mutex); } /* diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h index e04c214b4f6..94b55de9033 100644 --- a/fs/ceph/messenger.h +++ b/fs/ceph/messenger.h @@ -155,8 +155,9 @@ struct ceph_connection { void *auth_reply_buf; /* where to put the authorizer reply */ int auth_reply_buf_len; + struct mutex mutex; + /* out queue */ - struct mutex out_mutex; struct list_head out_queue; struct list_head out_sent; /* sending or sent but unacked */ u64 out_seq; /* last message queued for send */ -- cgit v1.2.3 From 350b1c32ea58d29e25d63fc25e92dd48f9339546 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 22 Dec 2009 10:45:45 -0800 Subject: ceph: control access to page vector for incoming data When we issue an OSD read, we specify a vector of pages that the data is to be read into. The request may be sent multiple times, to multiple OSDs, if the osdmap changes, which means we can get more than one reply. Only read data into the page vector if the reply is coming from the OSD we last sent the request to. Keep track of which connection is using the vector by taking a reference. If another connection was already using the vector before and a new reply comes in on the right connection, revoke the pages from the other connection. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 29 +++++++++++++++++++++++++++++ fs/ceph/messenger.h | 2 ++ fs/ceph/osd_client.c | 42 +++++++++++++++++++++++++++++++++--------- fs/ceph/osd_client.h | 4 +++- 4 files changed, 67 insertions(+), 10 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index c03b4185c14..506b638a023 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -1975,6 +1975,35 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) mutex_unlock(&con->mutex); } +/* + * Revoke a page vector that we may be reading data into + */ +void ceph_con_revoke_pages(struct ceph_connection *con, struct page **pages) +{ + mutex_lock(&con->mutex); + if (con->in_msg && con->in_msg->pages == pages) { + unsigned data_len = le32_to_cpu(con->in_hdr.data_len); + + /* skip rest of message */ + dout("con_revoke_pages %p msg %p pages %p revoked\n", con, + con->in_msg, pages); + if (con->in_msg_pos.data_pos < data_len) + con->in_base_pos = con->in_msg_pos.data_pos - data_len; + else + con->in_base_pos = con->in_base_pos - + sizeof(struct ceph_msg_header) - + sizeof(struct ceph_msg_footer); + con->in_msg->pages = NULL; + ceph_msg_put(con->in_msg); + con->in_msg = NULL; + con->in_tag = CEPH_MSGR_TAG_READY; + } else { + dout("con_revoke_pages %p msg %p pages %p no-op\n", + con, con->in_msg, pages); + } + mutex_unlock(&con->mutex); +} + /* * Queue a keepalive byte to ensure the tcp connection is alive. */ diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h index 94b55de9033..7e2aab1d3ce 100644 --- a/fs/ceph/messenger.h +++ b/fs/ceph/messenger.h @@ -230,6 +230,8 @@ extern void ceph_con_open(struct ceph_connection *con, extern void ceph_con_close(struct ceph_connection *con); extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg); extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg); +extern void ceph_con_revoke_pages(struct ceph_connection *con, + struct page **pages); extern void ceph_con_keepalive(struct ceph_connection *con); extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); extern void ceph_con_put(struct ceph_connection *con); diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index a1800fb6323..374f0013956 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -87,6 +87,13 @@ void ceph_osdc_release_request(struct kref *kref) ceph_msg_put(req->r_request); if (req->r_reply) ceph_msg_put(req->r_reply); + if (req->r_con_filling_pages) { + dout("release_request revoking pages %p from con %p\n", + req->r_pages, req->r_con_filling_pages); + ceph_con_revoke_pages(req->r_con_filling_pages, + req->r_pages); + ceph_con_put(req->r_con_filling_pages); + } if (req->r_own_pages) ceph_release_page_vector(req->r_pages, req->r_num_pages); @@ -687,7 +694,8 @@ static void handle_timeout(struct work_struct *work) * handle osd op reply. either call the callback if it is specified, * or do the completion to wake up the waiting thread. */ -static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) +static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, + struct ceph_connection *con) { struct ceph_osd_reply_head *rhead = msg->front.iov_base; struct ceph_osd_request *req; @@ -715,6 +723,16 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) ceph_osdc_get_request(req); flags = le32_to_cpu(rhead->flags); + /* + * if this connection filled our pages, drop our reference now, to + * avoid a (safe but slower) revoke later. + */ + if (req->r_con_filling_pages == con && req->r_pages == msg->pages) { + dout(" got pages, dropping con_filling_pages ref %p\n", con); + req->r_con_filling_pages = NULL; + ceph_con_put(con); + } + if (req->r_reply) { /* * once we see the message has been received, we don't @@ -1007,14 +1025,20 @@ static int prepare_pages(struct ceph_connection *con, struct ceph_msg *m, } dout("prepare_pages tid %llu has %d pages, want %d\n", tid, req->r_num_pages, want); - if (likely(req->r_num_pages >= want && !req->r_prepared_pages)) { - m->pages = req->r_pages; - m->nr_pages = req->r_num_pages; - req->r_reply = m; /* only for duration of read over socket */ - ceph_msg_get(m); - req->r_prepared_pages = 1; - ret = 0; /* success */ + if (unlikely(req->r_num_pages < want)) + goto out; + + if (req->r_con_filling_pages) { + dout("revoking pages %p from old con %p\n", req->r_pages, + req->r_con_filling_pages); + ceph_con_revoke_pages(req->r_con_filling_pages, req->r_pages); + ceph_con_put(req->r_con_filling_pages); } + req->r_con_filling_pages = ceph_con_get(con); + req->r_reply = ceph_msg_get(m); /* for duration of read over socket */ + m->pages = req->r_pages; + m->nr_pages = req->r_num_pages; + ret = 0; /* success */ out: mutex_unlock(&osdc->request_mutex); return ret; @@ -1269,7 +1293,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) ceph_osdc_handle_map(osdc, msg); break; case CEPH_MSG_OSD_OPREPLY: - handle_reply(osdc, msg); + handle_reply(osdc, msg, con); break; default: diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h index 2e4cfd1e9f1..8fef71cc445 100644 --- a/fs/ceph/osd_client.h +++ b/fs/ceph/osd_client.h @@ -43,11 +43,13 @@ struct ceph_osd_request { struct list_head r_osd_item; struct ceph_osd *r_osd; + struct ceph_connection *r_con_filling_pages; + struct ceph_msg *r_request, *r_reply; int r_result; int r_flags; /* any additional flags for the osd */ u32 r_sent; /* >0 if r_request is sending/sent */ - int r_prepared_pages, r_got_reply; + int r_got_reply; int r_num_prealloc_reply; struct ceph_osd_client *r_osdc; -- cgit v1.2.3 From 04a419f908b5291ff7e8ffd7aa351fa0ac0c08af Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 23 Dec 2009 09:30:21 -0800 Subject: ceph: add feature bits to connection handshake (protocol change) Define supported and required feature set. Fail connection if the server requires features we do not support (TAG_FEATURES), or if the server does not support features we require. Signed-off-by: Sage Weil --- fs/ceph/ceph_fs.h | 6 ++++++ fs/ceph/dir.c | 12 ++++++------ fs/ceph/messenger.c | 47 +++++++++++++++++++++++++++++++++++++---------- fs/ceph/msgr.h | 5 ++++- 4 files changed, 53 insertions(+), 17 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h index e87dfa6ec8e..db3fed33c4a 100644 --- a/fs/ceph/ceph_fs.h +++ b/fs/ceph/ceph_fs.h @@ -50,6 +50,12 @@ #define CEPH_MAX_MON 31 +/* + * feature bits + */ +#define CEPH_FEATURE_SUPPORTED 0 +#define CEPH_FEATURE_REQUIRED 0 + /* * ceph_file_layout - describe data layout for a file/inode diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index fde839c6123..5107384ee02 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1135,9 +1135,9 @@ void ceph_dentry_lru_add(struct dentry *dn) { struct ceph_dentry_info *di = ceph_dentry(dn); struct ceph_mds_client *mdsc; - dout("dentry_lru_add %p %p\t%.*s\n", - di, dn, dn->d_name.len, dn->d_name.name); + dout("dentry_lru_add %p %p '%.*s'\n", di, dn, + dn->d_name.len, dn->d_name.name); if (di) { mdsc = &ceph_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); @@ -1151,9 +1151,9 @@ void ceph_dentry_lru_touch(struct dentry *dn) { struct ceph_dentry_info *di = ceph_dentry(dn); struct ceph_mds_client *mdsc; - dout("dentry_lru_touch %p %p\t%.*s\n", - di, dn, dn->d_name.len, dn->d_name.name); + dout("dentry_lru_touch %p %p '%.*s'\n", di, dn, + dn->d_name.len, dn->d_name.name); if (di) { mdsc = &ceph_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); @@ -1167,8 +1167,8 @@ void ceph_dentry_lru_del(struct dentry *dn) struct ceph_dentry_info *di = ceph_dentry(dn); struct ceph_mds_client *mdsc; - dout("dentry_lru_del %p %p\t%.*s\n", - di, dn, dn->d_name.len, dn->d_name.name); + dout("dentry_lru_del %p %p '%.*s'\n", di, dn, + dn->d_name.len, dn->d_name.name); if (di) { mdsc = &ceph_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 506b638a023..68052f66428 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -631,6 +631,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr, dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, con->connect_seq, global_seq, proto); + con->out_connect.features = CEPH_FEATURE_SUPPORTED; con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); con->out_connect.global_seq = cpu_to_le32(global_seq); @@ -1080,15 +1081,37 @@ static int process_banner(struct ceph_connection *con) return 0; } +static void fail_protocol(struct ceph_connection *con) +{ + reset_connection(con); + set_bit(CLOSED, &con->state); /* in case there's queued work */ + + mutex_unlock(&con->mutex); + if (con->ops->bad_proto) + con->ops->bad_proto(con); + mutex_lock(&con->mutex); +} + static int process_connect(struct ceph_connection *con) { + u64 sup_feat = CEPH_FEATURE_SUPPORTED; + u64 req_feat = CEPH_FEATURE_REQUIRED; + u64 server_feat = le64_to_cpu(con->in_reply.features); + dout("process_connect on %p tag %d\n", con, (int)con->in_tag); switch (con->in_reply.tag) { + case CEPH_MSGR_TAG_FEATURES: + pr_err("%s%lld %s feature set mismatch," + " my %llx < server's %llx, missing %llx\n", + ENTITY_NAME(con->peer_name), + pr_addr(&con->peer_addr.in_addr), + sup_feat, server_feat, server_feat & ~sup_feat); + con->error_msg = "missing required protocol features"; + fail_protocol(con); + return -1; + case CEPH_MSGR_TAG_BADPROTOVER: - dout("process_connect got BADPROTOVER my %d != their %d\n", - le32_to_cpu(con->out_connect.protocol_version), - le32_to_cpu(con->in_reply.protocol_version)); pr_err("%s%lld %s protocol version mismatch," " my %d != server's %d\n", ENTITY_NAME(con->peer_name), @@ -1096,13 +1119,7 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->out_connect.protocol_version), le32_to_cpu(con->in_reply.protocol_version)); con->error_msg = "protocol version mismatch"; - reset_connection(con); - set_bit(CLOSED, &con->state); /* in case there's queued work */ - - mutex_unlock(&con->mutex); - if (con->ops->bad_proto) - con->ops->bad_proto(con); - mutex_lock(&con->mutex); + fail_protocol(con); return -1; case CEPH_MSGR_TAG_BADAUTHORIZER: @@ -1173,6 +1190,16 @@ static int process_connect(struct ceph_connection *con) break; case CEPH_MSGR_TAG_READY: + if (req_feat & ~server_feat) { + pr_err("%s%lld %s protocol feature mismatch," + " my required %llx > server's %llx, need %llx\n", + ENTITY_NAME(con->peer_name), + pr_addr(&con->peer_addr.in_addr), + req_feat, server_feat, req_feat & ~server_feat); + con->error_msg = "missing required protocol features"; + fail_protocol(con); + return -1; + } clear_bit(CONNECTING, &con->state); con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); con->connect_seq++; diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h index e46d8b806de..be83f93182e 100644 --- a/fs/ceph/msgr.h +++ b/fs/ceph/msgr.h @@ -21,7 +21,7 @@ * whenever the wire protocol changes. try to keep this string length * constant. */ -#define CEPH_BANNER "ceph v025" +#define CEPH_BANNER "ceph v026" #define CEPH_BANNER_MAX_LEN 30 @@ -100,12 +100,14 @@ struct ceph_entity_inst { #define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */ #define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */ #define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */ +#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */ /* * connection negotiation */ struct ceph_msg_connect { + __le64 features; /* supported feature bits */ __le32 host_type; /* CEPH_ENTITY_TYPE_* */ __le32 global_seq; /* count connections initiated by this host */ __le32 connect_seq; /* count connections initiated in this session */ @@ -117,6 +119,7 @@ struct ceph_msg_connect { struct ceph_msg_connect_reply { __u8 tag; + __le64 features; /* feature bits for this session */ __le32 global_seq; __le32 connect_seq; __le32 protocol_version; -- cgit v1.2.3 From 58bb3b374b07a2a43315213f00a48a5ffd6d0915 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 23 Dec 2009 12:12:31 -0800 Subject: ceph: support ceph_pagelist for message payload The ceph_pagelist is a simple list of whole pages, strung together via their lru list_head. It facilitates encoding to a "buffer" of unknown size. Allow its use in place of the ceph_msg page vector. This will be used to fix the huge buffer preallocation woes of MDS reconnection. Signed-off-by: Sage Weil --- fs/ceph/Makefile | 2 +- fs/ceph/messenger.c | 24 ++++++++++++++++++++---- fs/ceph/messenger.h | 1 + fs/ceph/pagelist.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ceph/pagelist.h | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 130 insertions(+), 5 deletions(-) create mode 100644 fs/ceph/pagelist.c create mode 100644 fs/ceph/pagelist.h (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 827629c8576..47caf2f1b75 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -8,7 +8,7 @@ obj-$(CONFIG_CEPH_FS) += ceph.o ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \ export.o caps.o snap.o xattr.o \ - messenger.o msgpool.o buffer.o \ + messenger.o msgpool.o buffer.o pagelist.o \ mds_client.o mdsmap.o \ mon_client.o \ osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \ diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 68052f66428..c1106e8360f 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -13,6 +13,7 @@ #include "super.h" #include "messenger.h" #include "decode.h" +#include "pagelist.h" /* * Ceph uses the messenger to exchange ceph_msg messages with other @@ -728,6 +729,11 @@ static int write_partial_msg_pages(struct ceph_connection *con) page = msg->pages[con->out_msg_pos.page]; if (crc) kaddr = kmap(page); + } else if (msg->pagelist) { + page = list_first_entry(&msg->pagelist->head, + struct page, lru); + if (crc) + kaddr = kmap(page); } else { page = con->msgr->zero_page; if (crc) @@ -750,7 +756,7 @@ static int write_partial_msg_pages(struct ceph_connection *con) MSG_DONTWAIT | MSG_NOSIGNAL | MSG_MORE); - if (crc && msg->pages) + if (crc && (msg->pages || msg->pagelist)) kunmap(page); if (ret <= 0) @@ -762,6 +768,9 @@ static int write_partial_msg_pages(struct ceph_connection *con) con->out_msg_pos.page_pos = 0; con->out_msg_pos.page++; con->out_msg_pos.did_page_crc = 0; + if (msg->pagelist) + list_move_tail(&page->lru, + &msg->pagelist->head); } } @@ -1051,13 +1060,13 @@ static int process_banner(struct ceph_connection *con) &con->actual_peer_addr) && !(addr_is_blank(&con->actual_peer_addr.in_addr) && con->actual_peer_addr.nonce == con->peer_addr.nonce)) { - pr_err("wrong peer, want %s/%d, " - "got %s/%d, wtf\n", + pr_warning("wrong peer, want %s/%d, " + "got %s/%d\n", pr_addr(&con->peer_addr.in_addr), con->peer_addr.nonce, pr_addr(&con->actual_peer_addr.in_addr), con->actual_peer_addr.nonce); - con->error_msg = "protocol error, wrong peer"; + con->error_msg = "wrong peer at address"; return -1; } @@ -2096,6 +2105,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, /* data */ m->nr_pages = calc_pages_for(page_off, page_len); m->pages = pages; + m->pagelist = NULL; dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len, m->nr_pages); @@ -2181,6 +2191,12 @@ void ceph_msg_last_put(struct kref *kref) m->nr_pages = 0; m->pages = NULL; + if (m->pagelist) { + ceph_pagelist_release(m->pagelist); + kfree(m->pagelist); + m->pagelist = NULL; + } + if (m->pool) ceph_msgpool_put(m->pool, m); else diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h index 7e2aab1d3ce..a7b68414509 100644 --- a/fs/ceph/messenger.h +++ b/fs/ceph/messenger.h @@ -85,6 +85,7 @@ struct ceph_msg { struct ceph_buffer *middle; struct page **pages; /* data payload. NOT OWNER. */ unsigned nr_pages; /* size of page array */ + struct ceph_pagelist *pagelist; /* instead of pages */ struct list_head list_head; struct kref kref; bool front_is_vmalloc; diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c new file mode 100644 index 00000000000..370e9369547 --- /dev/null +++ b/fs/ceph/pagelist.c @@ -0,0 +1,54 @@ + +#include +#include + +#include "pagelist.h" + +int ceph_pagelist_release(struct ceph_pagelist *pl) +{ + if (pl->mapped_tail) + kunmap(pl->mapped_tail); + while (!list_empty(&pl->head)) { + struct page *page = list_first_entry(&pl->head, struct page, + lru); + list_del(&page->lru); + __free_page(page); + } + return 0; +} + +static int ceph_pagelist_addpage(struct ceph_pagelist *pl) +{ + struct page *page = alloc_page(GFP_NOFS); + if (!page) + return -ENOMEM; + pl->room += PAGE_SIZE; + list_add_tail(&page->lru, &pl->head); + if (pl->mapped_tail) + kunmap(pl->mapped_tail); + pl->mapped_tail = kmap(page); + return 0; +} + +int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len) +{ + while (pl->room < len) { + size_t bit = pl->room; + int ret; + + memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), + buf, bit); + pl->length += bit; + pl->room -= bit; + buf += bit; + len -= bit; + ret = ceph_pagelist_addpage(pl); + if (ret) + return ret; + } + + memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len); + pl->length += len; + pl->room -= len; + return 0; +} diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h new file mode 100644 index 00000000000..e8a4187e108 --- /dev/null +++ b/fs/ceph/pagelist.h @@ -0,0 +1,54 @@ +#ifndef __FS_CEPH_PAGELIST_H +#define __FS_CEPH_PAGELIST_H + +#include + +struct ceph_pagelist { + struct list_head head; + void *mapped_tail; + size_t length; + size_t room; +}; + +static inline void ceph_pagelist_init(struct ceph_pagelist *pl) +{ + INIT_LIST_HEAD(&pl->head); + pl->mapped_tail = NULL; + pl->length = 0; + pl->room = 0; +} +extern int ceph_pagelist_release(struct ceph_pagelist *pl); + +extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l); + +static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v) +{ + __le64 ev = cpu_to_le64(v); + return ceph_pagelist_append(pl, &ev, sizeof(ev)); +} +static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v) +{ + __le32 ev = cpu_to_le32(v); + return ceph_pagelist_append(pl, &ev, sizeof(ev)); +} +static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v) +{ + __le16 ev = cpu_to_le16(v); + return ceph_pagelist_append(pl, &ev, sizeof(ev)); +} +static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v) +{ + return ceph_pagelist_append(pl, &v, 1); +} +static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl, + char *s, size_t len) +{ + int ret = ceph_pagelist_encode_32(pl, len); + if (ret) + return ret; + if (len) + return ceph_pagelist_append(pl, s, len); + return 0; +} + +#endif -- cgit v1.2.3 From 103e2d3ae57d38d18aaac1b327266c1407499ac1 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 7 Jan 2010 16:12:36 -0800 Subject: ceph: remove unused erank field The ceph_entity_addr erank field is obsolete; remove it. Get rid of trivial addr comparison helpers while we're at it. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 19 ++++++++----------- fs/ceph/mon_client.c | 3 +-- fs/ceph/msgr.h | 18 ++---------------- fs/ceph/osd_client.c | 7 ++++--- 4 files changed, 15 insertions(+), 32 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index c1106e8360f..1360708d750 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -1056,16 +1056,15 @@ static int process_banner(struct ceph_connection *con) * end may not yet know their ip address, so if it's 0.0.0.0, give * them the benefit of the doubt. */ - if (!ceph_entity_addr_is_local(&con->peer_addr, - &con->actual_peer_addr) && + if (memcmp(&con->peer_addr, &con->actual_peer_addr, + sizeof(con->peer_addr)) != 0 && !(addr_is_blank(&con->actual_peer_addr.in_addr) && con->actual_peer_addr.nonce == con->peer_addr.nonce)) { - pr_warning("wrong peer, want %s/%d, " - "got %s/%d\n", - pr_addr(&con->peer_addr.in_addr), - con->peer_addr.nonce, - pr_addr(&con->actual_peer_addr.in_addr), - con->actual_peer_addr.nonce); + pr_warning("wrong peer, want %s/%lld, got %s/%lld\n", + pr_addr(&con->peer_addr.in_addr), + le64_to_cpu(con->peer_addr.nonce), + pr_addr(&con->actual_peer_addr.in_addr), + le64_to_cpu(con->actual_peer_addr.nonce)); con->error_msg = "wrong peer at address"; return -1; } @@ -1934,8 +1933,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr) msgr->inst.addr = *myaddr; /* select a random nonce */ - get_random_bytes(&msgr->inst.addr.nonce, - sizeof(msgr->inst.addr.nonce)); + get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); encode_my_addr(msgr); dout("messenger_create %p\n", msgr); @@ -1966,7 +1964,6 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) msg->hdr.src.name = con->msgr->inst.name; msg->hdr.src.addr = con->msgr->my_enc_addr; msg->hdr.orig_src = msg->hdr.src; - msg->hdr.dst_erank = con->peer_addr.erank; /* queue */ mutex_lock(&con->mutex); diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c index bb94006fc68..223e8bc207e 100644 --- a/fs/ceph/mon_client.c +++ b/fs/ceph/mon_client.c @@ -88,7 +88,7 @@ int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr) int i; for (i = 0; i < m->num_mon; i++) - if (ceph_entity_addr_equal(addr, &m->mon_inst[i].addr)) + if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0) return 1; return 0; } @@ -503,7 +503,6 @@ static int build_initial_monmap(struct ceph_mon_client *monc) return -ENOMEM; for (i = 0; i < num_mon; i++) { monc->monmap->mon_inst[i].addr = mon_addr[i]; - monc->monmap->mon_inst[i].addr.erank = 0; monc->monmap->mon_inst[i].addr.nonce = 0; monc->monmap->mon_inst[i].name.type = CEPH_ENTITY_TYPE_MON; diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h index be83f93182e..40b6189aa9e 100644 --- a/fs/ceph/msgr.h +++ b/fs/ceph/msgr.h @@ -61,24 +61,10 @@ extern const char *ceph_entity_type_name(int type); * entity_addr -- network address */ struct ceph_entity_addr { - __le32 erank; /* entity's rank in process */ - __le32 nonce; /* unique id for process (e.g. pid) */ + __le64 nonce; /* unique id for process (e.g. pid) */ struct sockaddr_storage in_addr; } __attribute__ ((packed)); -static inline bool ceph_entity_addr_is_local(const struct ceph_entity_addr *a, - const struct ceph_entity_addr *b) -{ - return a->nonce == b->nonce && - memcmp(&a->in_addr, &b->in_addr, sizeof(a->in_addr)) == 0; -} - -static inline bool ceph_entity_addr_equal(const struct ceph_entity_addr *a, - const struct ceph_entity_addr *b) -{ - return memcmp(a, b, sizeof(*a)) == 0; -} - struct ceph_entity_inst { struct ceph_entity_name name; struct ceph_entity_addr addr; @@ -147,7 +133,7 @@ struct ceph_msg_header { receiver: mask against ~PAGE_MASK */ struct ceph_entity_inst src, orig_src; - __le32 dst_erank; + __le32 reserved; __le32 crc; /* header crc32c */ } __attribute__ ((packed)); diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index a0aac436d5d..80b868f7a0f 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -821,9 +821,10 @@ static void kick_requests(struct ceph_osd_client *osdc, n = rb_next(p); if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || - !ceph_entity_addr_equal(&osd->o_con.peer_addr, - ceph_osd_addr(osdc->osdmap, - osd->o_osd))) + memcmp(&osd->o_con.peer_addr, + ceph_osd_addr(osdc->osdmap, + osd->o_osd), + sizeof(struct ceph_entity_addr)) != 0) reset_osd(osdc, osd); } } -- cgit v1.2.3 From 2450418c47b7998ad55a73f23707b1e21c371eef Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Fri, 8 Jan 2010 13:58:34 -0800 Subject: ceph: allocate middle of message before stating to read Both front and middle parts of the message are now being allocated at the ceph_alloc_msg(). Signed-off-by: Yehuda Sadeh --- fs/ceph/mds_client.c | 2 - fs/ceph/messenger.c | 142 +++++++++++++++++++++++++++++---------------------- fs/ceph/messenger.h | 9 +--- fs/ceph/mon_client.c | 25 ++++++--- fs/ceph/osd_client.c | 17 ++++-- 5 files changed, 115 insertions(+), 80 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 623c67cd484..93998a0678c 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2953,8 +2953,6 @@ const static struct ceph_connection_operations mds_con_ops = { .get_authorizer = get_authorizer, .verify_authorizer_reply = verify_authorizer_reply, .peer_reset = peer_reset, - .alloc_msg = ceph_alloc_msg, - .alloc_middle = ceph_alloc_middle, }; diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 1360708d750..25de15c006b 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -1279,8 +1279,34 @@ static void process_ack(struct ceph_connection *con) +static int read_partial_message_section(struct ceph_connection *con, + struct kvec *section, unsigned int sec_len, + u32 *crc) +{ + int left; + int ret; + + BUG_ON(!section); + + while (section->iov_len < sec_len) { + BUG_ON(section->iov_base == NULL); + left = sec_len - section->iov_len; + ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base + + section->iov_len, left); + if (ret <= 0) + return ret; + section->iov_len += ret; + if (section->iov_len == sec_len) + *crc = crc32c(0, section->iov_base, + section->iov_len); + } + return 1; +} +static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip); /* * read (part of) a message. */ @@ -1292,6 +1318,7 @@ static int read_partial_message(struct ceph_connection *con) int to, want, left; unsigned front_len, middle_len, data_len, data_off; int datacrc = con->msgr->nocrc; + int skip; dout("read_partial_message con %p msg %p\n", con, m); @@ -1315,7 +1342,6 @@ static int read_partial_message(struct ceph_connection *con) } } } - front_len = le32_to_cpu(con->in_hdr.front_len); if (front_len > CEPH_MSG_MAX_FRONT_LEN) return -EIO; @@ -1330,8 +1356,8 @@ static int read_partial_message(struct ceph_connection *con) if (!con->in_msg) { dout("got hdr type %d front %d data %d\n", con->in_hdr.type, con->in_hdr.front_len, con->in_hdr.data_len); - con->in_msg = con->ops->alloc_msg(con, &con->in_hdr); - if (!con->in_msg) { + con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); + if (skip) { /* skip this message */ pr_err("alloc_msg returned NULL, skipping message\n"); con->in_base_pos = -front_len - middle_len - data_len - @@ -1342,56 +1368,28 @@ static int read_partial_message(struct ceph_connection *con) if (IS_ERR(con->in_msg)) { ret = PTR_ERR(con->in_msg); con->in_msg = NULL; - con->error_msg = "out of memory for incoming message"; + con->error_msg = "error allocating memory for incoming message"; return ret; } m = con->in_msg; m->front.iov_len = 0; /* haven't read it yet */ + if (m->middle) + m->middle->vec.iov_len = 0; memcpy(&m->hdr, &con->in_hdr, sizeof(con->in_hdr)); } /* front */ - while (m->front.iov_len < front_len) { - BUG_ON(m->front.iov_base == NULL); - left = front_len - m->front.iov_len; - ret = ceph_tcp_recvmsg(con->sock, (char *)m->front.iov_base + - m->front.iov_len, left); - if (ret <= 0) - return ret; - m->front.iov_len += ret; - if (m->front.iov_len == front_len) - con->in_front_crc = crc32c(0, m->front.iov_base, - m->front.iov_len); - } + ret = read_partial_message_section(con, &m->front, front_len, + &con->in_front_crc); + if (ret <= 0) + return ret; /* middle */ - while (middle_len > 0 && (!m->middle || - m->middle->vec.iov_len < middle_len)) { - if (m->middle == NULL) { - ret = -EOPNOTSUPP; - if (con->ops->alloc_middle) - ret = con->ops->alloc_middle(con, m); - if (ret < 0) { - pr_err("alloc_middle fail skipping payload\n"); - con->in_base_pos = -middle_len - data_len - - sizeof(m->footer); - ceph_msg_put(con->in_msg); - con->in_msg = NULL; - con->in_tag = CEPH_MSGR_TAG_READY; - return 0; - } - m->middle->vec.iov_len = 0; - } - left = middle_len - m->middle->vec.iov_len; - ret = ceph_tcp_recvmsg(con->sock, - (char *)m->middle->vec.iov_base + - m->middle->vec.iov_len, left); + if (m->middle) { + ret = read_partial_message_section(con, &m->middle->vec, middle_len, + &con->in_middle_crc); if (ret <= 0) return ret; - m->middle->vec.iov_len += ret; - if (m->middle->vec.iov_len == middle_len) - con->in_middle_crc = crc32c(0, m->middle->vec.iov_base, - m->middle->vec.iov_len); } /* (page) data */ @@ -2115,24 +2113,6 @@ out: return ERR_PTR(-ENOMEM); } -/* - * Generic message allocator, for incoming messages. - */ -struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, - struct ceph_msg_header *hdr) -{ - int type = le16_to_cpu(hdr->type); - int front_len = le32_to_cpu(hdr->front_len); - struct ceph_msg *msg = ceph_msg_new(type, front_len, 0, 0, NULL); - - if (!msg) { - pr_err("unable to allocate msg type %d len %d\n", - type, front_len); - return ERR_PTR(-ENOMEM); - } - return msg; -} - /* * Allocate "middle" portion of a message, if it is needed and wasn't * allocated by alloc_msg. This allows us to read a small fixed-size @@ -2140,7 +2120,7 @@ struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, * propagate the error to the caller based on info in the front) when * the middle is too large. */ -int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) +static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) { int type = le16_to_cpu(msg->hdr.type); int middle_len = le32_to_cpu(msg->hdr.middle_len); @@ -2156,6 +2136,48 @@ int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) return 0; } +/* + * Generic message allocator, for incoming messages. + */ +static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip) +{ + int type = le16_to_cpu(hdr->type); + int front_len = le32_to_cpu(hdr->front_len); + int middle_len = le32_to_cpu(hdr->middle_len); + struct ceph_msg *msg = NULL; + int ret; + + if (con->ops->alloc_msg) { + msg = con->ops->alloc_msg(con, hdr, skip); + if (IS_ERR(msg)) + return msg; + + if (*skip) + return NULL; + } + if (!msg) { + *skip = 0; + msg = ceph_msg_new(type, front_len, 0, 0, NULL); + if (!msg) { + pr_err("unable to allocate msg type %d len %d\n", + type, front_len); + return ERR_PTR(-ENOMEM); + } + } + + if (middle_len) { + ret = ceph_alloc_middle(con, msg); + + if (ret < 0) { + ceph_msg_put(msg); + return msg; + } + } + return msg; +} + /* * Free a generically kmalloc'd message. diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h index a7b68414509..b6bec59056d 100644 --- a/fs/ceph/messenger.h +++ b/fs/ceph/messenger.h @@ -44,9 +44,8 @@ struct ceph_connection_operations { void (*peer_reset) (struct ceph_connection *con); struct ceph_msg * (*alloc_msg) (struct ceph_connection *con, - struct ceph_msg_header *hdr); - int (*alloc_middle) (struct ceph_connection *con, - struct ceph_msg *msg); + struct ceph_msg_header *hdr, + int *skip); /* an incoming message has a data payload; tell me what pages I * should read the data into. */ int (*prepare_pages) (struct ceph_connection *con, struct ceph_msg *m, @@ -242,10 +241,6 @@ extern struct ceph_msg *ceph_msg_new(int type, int front_len, struct page **pages); extern void ceph_msg_kfree(struct ceph_msg *m); -extern struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, - struct ceph_msg_header *hdr); -extern int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg); - static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg) { diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c index 223e8bc207e..6c00b37cc55 100644 --- a/fs/ceph/mon_client.c +++ b/fs/ceph/mon_client.c @@ -692,21 +692,33 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) * Allocate memory for incoming message */ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, - struct ceph_msg_header *hdr) + struct ceph_msg_header *hdr, + int *skip) { struct ceph_mon_client *monc = con->private; int type = le16_to_cpu(hdr->type); - int front = le32_to_cpu(hdr->front_len); + int front_len = le32_to_cpu(hdr->front_len); + struct ceph_msg *m; + *skip = 0; switch (type) { case CEPH_MSG_MON_SUBSCRIBE_ACK: - return ceph_msgpool_get(&monc->msgpool_subscribe_ack, front); + m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len); + break; case CEPH_MSG_STATFS_REPLY: - return ceph_msgpool_get(&monc->msgpool_statfs_reply, front); + m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len); + break; case CEPH_MSG_AUTH_REPLY: - return ceph_msgpool_get(&monc->msgpool_auth_reply, front); + m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len); + break; + default: + return NULL; } - return ceph_alloc_msg(con, hdr); + + if (!m) + *skip = 1; + + return m; } /* @@ -749,5 +761,4 @@ const static struct ceph_connection_operations mon_con_ops = { .dispatch = dispatch, .fault = mon_fault, .alloc_msg = mon_alloc_msg, - .alloc_middle = ceph_alloc_middle, }; diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index 8417e21a3cb..545e9361799 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -1304,18 +1304,28 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) } static struct ceph_msg *alloc_msg(struct ceph_connection *con, - struct ceph_msg_header *hdr) + struct ceph_msg_header *hdr, + int *skip) { struct ceph_osd *osd = con->private; struct ceph_osd_client *osdc = osd->o_osdc; int type = le16_to_cpu(hdr->type); int front = le32_to_cpu(hdr->front_len); + struct ceph_msg *m; + *skip = 0; switch (type) { case CEPH_MSG_OSD_OPREPLY: - return ceph_msgpool_get(&osdc->msgpool_op_reply, front); + m = ceph_msgpool_get(&osdc->msgpool_op_reply, front); + break; + default: + return NULL; } - return ceph_alloc_msg(con, hdr); + + if (!m) + *skip = 1; + + return m; } /* @@ -1390,6 +1400,5 @@ const static struct ceph_connection_operations osd_con_ops = { .verify_authorizer_reply = verify_authorizer_reply, .alloc_msg = alloc_msg, .fault = osd_reset, - .alloc_middle = ceph_alloc_middle, .prepare_pages = prepare_pages, }; -- cgit v1.2.3 From 9d7f0f139edfdce1a1539b100c617fd9182b0829 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Mon, 11 Jan 2010 10:32:02 -0800 Subject: ceph: refactor messages data section allocation Signed-off-by: Yehuda Sadeh --- fs/ceph/messenger.c | 67 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 28 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 25de15c006b..e8742cc9ecd 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -1315,7 +1315,7 @@ static int read_partial_message(struct ceph_connection *con) struct ceph_msg *m = con->in_msg; void *p; int ret; - int to, want, left; + int to, left; unsigned front_len, middle_len, data_len, data_off; int datacrc = con->msgr->nocrc; int skip; @@ -1351,6 +1351,7 @@ static int read_partial_message(struct ceph_connection *con) data_len = le32_to_cpu(con->in_hdr.data_len); if (data_len > CEPH_MSG_MAX_DATA_LEN) return -EIO; + data_off = le16_to_cpu(con->in_hdr.data_off); /* allocate message? */ if (!con->in_msg) { @@ -1375,7 +1376,10 @@ static int read_partial_message(struct ceph_connection *con) m->front.iov_len = 0; /* haven't read it yet */ if (m->middle) m->middle->vec.iov_len = 0; - memcpy(&m->hdr, &con->in_hdr, sizeof(con->in_hdr)); + + con->in_msg_pos.page = 0; + con->in_msg_pos.page_pos = data_off & ~PAGE_MASK; + con->in_msg_pos.data_pos = 0; } /* front */ @@ -1393,31 +1397,6 @@ static int read_partial_message(struct ceph_connection *con) } /* (page) data */ - data_off = le16_to_cpu(m->hdr.data_off); - if (data_len == 0) - goto no_data; - - if (m->nr_pages == 0) { - con->in_msg_pos.page = 0; - con->in_msg_pos.page_pos = data_off & ~PAGE_MASK; - con->in_msg_pos.data_pos = 0; - /* find pages for data payload */ - want = calc_pages_for(data_off & ~PAGE_MASK, data_len); - ret = -1; - mutex_unlock(&con->mutex); - if (con->ops->prepare_pages) - ret = con->ops->prepare_pages(con, m, want); - mutex_lock(&con->mutex); - if (ret < 0) { - dout("%p prepare_pages failed, skipping payload\n", m); - con->in_base_pos = -data_len - sizeof(m->footer); - ceph_msg_put(con->in_msg); - con->in_msg = NULL; - con->in_tag = CEPH_MSGR_TAG_READY; - return 0; - } - BUG_ON(m->nr_pages < want); - } while (con->in_msg_pos.data_pos < data_len) { left = min((int)(data_len - con->in_msg_pos.data_pos), (int)(PAGE_SIZE - con->in_msg_pos.page_pos)); @@ -1440,7 +1419,6 @@ static int read_partial_message(struct ceph_connection *con) } } -no_data: /* footer */ to = sizeof(m->hdr) + sizeof(m->footer); while (con->in_base_pos < to) { @@ -2136,6 +2114,25 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) return 0; } +static int ceph_alloc_data_section(struct ceph_connection *con, struct ceph_msg *msg) +{ + int ret; + int want; + int data_len = le32_to_cpu(msg->hdr.data_len); + unsigned data_off = le16_to_cpu(msg->hdr.data_off); + + want = calc_pages_for(data_off & ~PAGE_MASK, data_len); + ret = -1; + mutex_unlock(&con->mutex); + if (con->ops->prepare_pages) + ret = con->ops->prepare_pages(con, msg, want); + mutex_lock(&con->mutex); + + BUG_ON(msg->nr_pages < want); + + return ret; +} + /* * Generic message allocator, for incoming messages. */ @@ -2146,6 +2143,7 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, int type = le16_to_cpu(hdr->type); int front_len = le32_to_cpu(hdr->front_len); int middle_len = le32_to_cpu(hdr->middle_len); + int data_len = le32_to_cpu(hdr->data_len); struct ceph_msg *msg = NULL; int ret; @@ -2166,6 +2164,7 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, return ERR_PTR(-ENOMEM); } } + memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); if (middle_len) { ret = ceph_alloc_middle(con, msg); @@ -2175,6 +2174,18 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, return msg; } } + + if (data_len) { + ret = ceph_alloc_data_section(con, msg); + + if (ret < 0) { + *skip = 1; + ceph_msg_put(msg); + return NULL; + } + } + + return msg; } -- cgit v1.2.3 From 0547a9b30a5ac8680325752b61d3ffa9d4971b6e Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Mon, 11 Jan 2010 14:47:13 -0800 Subject: ceph: alloc message data pages and check if tid exists Now doing it in the same callback that is also responsible for allocating the 'front' part of the message. If we get a message that we haven't got a corresponding tid for, mark it for skipping. Moving the mutex unlock/lock from the osd alloc_msg callback to the calling function in the messenger. Signed-off-by: Yehuda Sadeh --- fs/ceph/messenger.c | 33 ++------------------------ fs/ceph/messenger.h | 4 ---- fs/ceph/mon_client.c | 1 + fs/ceph/osd_client.c | 66 +++++++++++++++++++++++++++++++++------------------- 4 files changed, 45 insertions(+), 59 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index e8742cc9ecd..f708803e685 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -2114,25 +2114,6 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) return 0; } -static int ceph_alloc_data_section(struct ceph_connection *con, struct ceph_msg *msg) -{ - int ret; - int want; - int data_len = le32_to_cpu(msg->hdr.data_len); - unsigned data_off = le16_to_cpu(msg->hdr.data_off); - - want = calc_pages_for(data_off & ~PAGE_MASK, data_len); - ret = -1; - mutex_unlock(&con->mutex); - if (con->ops->prepare_pages) - ret = con->ops->prepare_pages(con, msg, want); - mutex_lock(&con->mutex); - - BUG_ON(msg->nr_pages < want); - - return ret; -} - /* * Generic message allocator, for incoming messages. */ @@ -2143,12 +2124,13 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, int type = le16_to_cpu(hdr->type); int front_len = le32_to_cpu(hdr->front_len); int middle_len = le32_to_cpu(hdr->middle_len); - int data_len = le32_to_cpu(hdr->data_len); struct ceph_msg *msg = NULL; int ret; if (con->ops->alloc_msg) { + mutex_unlock(&con->mutex); msg = con->ops->alloc_msg(con, hdr, skip); + mutex_lock(&con->mutex); if (IS_ERR(msg)) return msg; @@ -2175,17 +2157,6 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, } } - if (data_len) { - ret = ceph_alloc_data_section(con, msg); - - if (ret < 0) { - *skip = 1; - ceph_msg_put(msg); - return NULL; - } - } - - return msg; } diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h index b6bec59056d..dca2d32b40d 100644 --- a/fs/ceph/messenger.h +++ b/fs/ceph/messenger.h @@ -46,10 +46,6 @@ struct ceph_connection_operations { struct ceph_msg * (*alloc_msg) (struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip); - /* an incoming message has a data payload; tell me what pages I - * should read the data into. */ - int (*prepare_pages) (struct ceph_connection *con, struct ceph_msg *m, - int want); }; extern const char *ceph_name_type_str(int t); diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c index 6c00b37cc55..3f7ae7f73c5 100644 --- a/fs/ceph/mon_client.c +++ b/fs/ceph/mon_client.c @@ -701,6 +701,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, struct ceph_msg *m; *skip = 0; + switch (type) { case CEPH_MSG_MON_SUBSCRIBE_ACK: m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len); diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index 545e9361799..44abe299c69 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -998,31 +998,26 @@ bad: * find those pages. * 0 = success, -1 failure. */ -static int prepare_pages(struct ceph_connection *con, struct ceph_msg *m, - int want) +static int prepare_pages(struct ceph_connection *con, + struct ceph_msg_header *hdr, + struct ceph_osd_request *req, + u64 tid, + struct ceph_msg *m) { struct ceph_osd *osd = con->private; struct ceph_osd_client *osdc; - struct ceph_osd_request *req; - u64 tid; int ret = -1; - int type = le16_to_cpu(m->hdr.type); + int data_len = le32_to_cpu(hdr->data_len); + unsigned data_off = le16_to_cpu(hdr->data_off); + + int want = calc_pages_for(data_off & ~PAGE_MASK, data_len); if (!osd) return -1; + osdc = osd->o_osdc; dout("prepare_pages on msg %p want %d\n", m, want); - if (unlikely(type != CEPH_MSG_OSD_OPREPLY)) - return -1; /* hmm! */ - - tid = le64_to_cpu(m->hdr.tid); - mutex_lock(&osdc->request_mutex); - req = __lookup_request(osdc, tid); - if (!req) { - dout("prepare_pages unknown tid %llu\n", tid); - goto out; - } dout("prepare_pages tid %llu has %d pages, want %d\n", tid, req->r_num_pages, want); if (unlikely(req->r_num_pages < want)) @@ -1040,7 +1035,8 @@ static int prepare_pages(struct ceph_connection *con, struct ceph_msg *m, m->nr_pages = req->r_num_pages; ret = 0; /* success */ out: - mutex_unlock(&osdc->request_mutex); + BUG_ON(ret < 0 || m->nr_pages < want); + return ret; } @@ -1311,19 +1307,42 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, struct ceph_osd_client *osdc = osd->o_osdc; int type = le16_to_cpu(hdr->type); int front = le32_to_cpu(hdr->front_len); + int data_len = le32_to_cpu(hdr->data_len); struct ceph_msg *m; + struct ceph_osd_request *req; + u64 tid; + int err; *skip = 0; - switch (type) { - case CEPH_MSG_OSD_OPREPLY: - m = ceph_msgpool_get(&osdc->msgpool_op_reply, front); - break; - default: + if (type != CEPH_MSG_OSD_OPREPLY) return NULL; - } - if (!m) + tid = le64_to_cpu(hdr->tid); + mutex_lock(&osdc->request_mutex); + req = __lookup_request(osdc, tid); + if (!req) { + *skip = 1; + m = NULL; + dout("prepare_pages unknown tid %llu\n", tid); + goto out; + } + m = ceph_msgpool_get(&osdc->msgpool_op_reply, front); + if (!m) { *skip = 1; + goto out; + } + + if (data_len > 0) { + err = prepare_pages(con, hdr, req, tid, m); + if (err < 0) { + *skip = 1; + ceph_msg_put(m); + m = ERR_PTR(err); + } + } + +out: + mutex_unlock(&osdc->request_mutex); return m; } @@ -1400,5 +1419,4 @@ const static struct ceph_connection_operations osd_con_ops = { .verify_authorizer_reply = verify_authorizer_reply, .alloc_msg = alloc_msg, .fault = osd_reset, - .prepare_pages = prepare_pages, }; -- cgit v1.2.3 From 0d59ab81c3d3adf466c3fd37d7fb6d46b05d1fd4 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Wed, 13 Jan 2010 17:03:23 -0800 Subject: ceph: keep reserved replies on the request structure This includes treating all the data preallocation and revokation at the same place, not having to have a special case for the reserved pages. Signed-off-by: Yehuda Sadeh --- fs/ceph/messenger.c | 20 ++++----- fs/ceph/messenger.h | 4 +- fs/ceph/osd_client.c | 118 ++++++++++++++++++++++++++++++++++++--------------- fs/ceph/osd_client.h | 8 ++-- 4 files changed, 100 insertions(+), 50 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index f708803e685..81bc779adb9 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -1985,30 +1985,30 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) } /* - * Revoke a page vector that we may be reading data into + * Revoke a message that we may be reading data into */ -void ceph_con_revoke_pages(struct ceph_connection *con, struct page **pages) +void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) { mutex_lock(&con->mutex); - if (con->in_msg && con->in_msg->pages == pages) { + if (con->in_msg && con->in_msg == msg) { + unsigned front_len = le32_to_cpu(con->in_hdr.front_len); + unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len); unsigned data_len = le32_to_cpu(con->in_hdr.data_len); /* skip rest of message */ - dout("con_revoke_pages %p msg %p pages %p revoked\n", con, - con->in_msg, pages); - if (con->in_msg_pos.data_pos < data_len) - con->in_base_pos = con->in_msg_pos.data_pos - data_len; - else + dout("con_revoke_pages %p msg %p revoked\n", con, msg); con->in_base_pos = con->in_base_pos - sizeof(struct ceph_msg_header) - + front_len - + middle_len - + data_len - sizeof(struct ceph_msg_footer); - con->in_msg->pages = NULL; ceph_msg_put(con->in_msg); con->in_msg = NULL; con->in_tag = CEPH_MSGR_TAG_READY; } else { dout("con_revoke_pages %p msg %p pages %p no-op\n", - con, con->in_msg, pages); + con, con->in_msg, msg); } mutex_unlock(&con->mutex); } diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h index dca2d32b40d..c26a3d8aa78 100644 --- a/fs/ceph/messenger.h +++ b/fs/ceph/messenger.h @@ -226,8 +226,8 @@ extern void ceph_con_open(struct ceph_connection *con, extern void ceph_con_close(struct ceph_connection *con); extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg); extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg); -extern void ceph_con_revoke_pages(struct ceph_connection *con, - struct page **pages); +extern void ceph_con_revoke_message(struct ceph_connection *con, + struct ceph_msg *msg); extern void ceph_con_keepalive(struct ceph_connection *con); extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); extern void ceph_con_put(struct ceph_connection *con); diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index 44abe299c69..df210683971 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -13,6 +13,8 @@ #include "decode.h" #include "auth.h" +#define OSD_REPLY_RESERVE_FRONT_LEN 512 + const static struct ceph_connection_operations osd_con_ops; static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd); @@ -73,6 +75,16 @@ static void calc_layout(struct ceph_osd_client *osdc, req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages); } +static void remove_replies(struct ceph_osd_request *req) +{ + int i; + int max = ARRAY_SIZE(req->replies); + + for (i=0; ireplies[i]) + ceph_msg_put(req->replies[i]); + } +} /* * requests @@ -87,12 +99,13 @@ void ceph_osdc_release_request(struct kref *kref) ceph_msg_put(req->r_request); if (req->r_reply) ceph_msg_put(req->r_reply); - if (req->r_con_filling_pages) { + remove_replies(req); + if (req->r_con_filling_msg) { dout("release_request revoking pages %p from con %p\n", - req->r_pages, req->r_con_filling_pages); - ceph_con_revoke_pages(req->r_con_filling_pages, - req->r_pages); - ceph_con_put(req->r_con_filling_pages); + req->r_pages, req->r_con_filling_msg); + ceph_con_revoke_message(req->r_con_filling_msg, + req->r_reply); + ceph_con_put(req->r_con_filling_msg); } if (req->r_own_pages) ceph_release_page_vector(req->r_pages, @@ -104,6 +117,60 @@ void ceph_osdc_release_request(struct kref *kref) kfree(req); } +static int alloc_replies(struct ceph_osd_request *req, int num_reply) +{ + int i; + int max = ARRAY_SIZE(req->replies); + + BUG_ON(num_reply > max); + + for (i=0; ireplies[i] = ceph_msg_new(0, OSD_REPLY_RESERVE_FRONT_LEN, 0, 0, NULL); + if (IS_ERR(req->replies[i])) { + int j; + int err = PTR_ERR(req->replies[i]); + for (j = 0; j<=i; j++) { + ceph_msg_put(req->replies[j]); + } + return err; + } + } + + for (; ireplies[i] = NULL; + } + + req->cur_reply = 0; + + return 0; +} + +static struct ceph_msg *__get_next_reply(struct ceph_connection *con, + struct ceph_osd_request *req, + int front_len) +{ + struct ceph_msg *reply; + if (req->r_con_filling_msg) { + dout("revoking reply msg %p from old con %p\n", req->r_reply, + req->r_con_filling_msg); + ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); + ceph_con_put(req->r_con_filling_msg); + req->cur_reply = 0; + } + reply = req->replies[req->cur_reply]; + if (!reply || front_len > OSD_REPLY_RESERVE_FRONT_LEN) { + /* maybe we can allocate it now? */ + reply = ceph_msg_new(0, front_len, 0, 0, NULL); + if (!reply || IS_ERR(reply)) { + pr_err(" reply alloc failed, front_len=%d\n", front_len); + return ERR_PTR(-ENOMEM); + } + } + req->r_con_filling_msg = ceph_con_get(con); + req->r_reply = ceph_msg_get(reply); /* for duration of read over socket */ + return ceph_msg_get(reply); +} + /* * build new request AND message, calculate layout, and adjust file * extent as needed. @@ -147,7 +214,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, if (req == NULL) return ERR_PTR(-ENOMEM); - err = ceph_msgpool_resv(&osdc->msgpool_op_reply, num_reply); + err = alloc_replies(req, num_reply); if (err) { ceph_osdc_put_request(req); return ERR_PTR(-ENOMEM); @@ -173,7 +240,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, else msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL); if (IS_ERR(msg)) { - ceph_msgpool_resv(&osdc->msgpool_op_reply, -num_reply); ceph_osdc_put_request(req); return ERR_PTR(PTR_ERR(msg)); } @@ -471,8 +537,6 @@ static void __unregister_request(struct ceph_osd_client *osdc, rb_erase(&req->r_node, &osdc->requests); osdc->num_requests--; - ceph_msgpool_resv(&osdc->msgpool_op_reply, -req->r_num_prealloc_reply); - if (req->r_osd) { /* make sure the original request isn't in flight. */ ceph_con_revoke(&req->r_osd->o_con, req->r_request); @@ -724,12 +788,12 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, flags = le32_to_cpu(rhead->flags); /* - * if this connection filled our pages, drop our reference now, to + * if this connection filled our message, drop our reference now, to * avoid a (safe but slower) revoke later. */ - if (req->r_con_filling_pages == con && req->r_pages == msg->pages) { - dout(" got pages, dropping con_filling_pages ref %p\n", con); - req->r_con_filling_pages = NULL; + if (req->r_con_filling_msg == con && req->r_reply == msg) { + dout(" got pages, dropping con_filling_msg ref %p\n", con); + req->r_con_filling_msg = NULL; ceph_con_put(con); } @@ -998,7 +1062,7 @@ bad: * find those pages. * 0 = success, -1 failure. */ -static int prepare_pages(struct ceph_connection *con, +static int __prepare_pages(struct ceph_connection *con, struct ceph_msg_header *hdr, struct ceph_osd_request *req, u64 tid, @@ -1017,20 +1081,10 @@ static int prepare_pages(struct ceph_connection *con, osdc = osd->o_osdc; - dout("prepare_pages on msg %p want %d\n", m, want); - dout("prepare_pages tid %llu has %d pages, want %d\n", + dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m, tid, req->r_num_pages, want); if (unlikely(req->r_num_pages < want)) goto out; - - if (req->r_con_filling_pages) { - dout("revoking pages %p from old con %p\n", req->r_pages, - req->r_con_filling_pages); - ceph_con_revoke_pages(req->r_con_filling_pages, req->r_pages); - ceph_con_put(req->r_con_filling_pages); - } - req->r_con_filling_pages = ceph_con_get(con); - req->r_reply = ceph_msg_get(m); /* for duration of read over socket */ m->pages = req->r_pages; m->nr_pages = req->r_num_pages; ret = 0; /* success */ @@ -1164,13 +1218,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) err = ceph_msgpool_init(&osdc->msgpool_op, 4096, 10, true); if (err < 0) goto out_mempool; - err = ceph_msgpool_init(&osdc->msgpool_op_reply, 512, 0, false); - if (err < 0) - goto out_msgpool; return 0; -out_msgpool: - ceph_msgpool_destroy(&osdc->msgpool_op); out_mempool: mempool_destroy(osdc->req_mempool); out: @@ -1186,7 +1235,6 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) } mempool_destroy(osdc->req_mempool); ceph_msgpool_destroy(&osdc->msgpool_op); - ceph_msgpool_destroy(&osdc->msgpool_op_reply); } /* @@ -1323,17 +1371,17 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, if (!req) { *skip = 1; m = NULL; - dout("prepare_pages unknown tid %llu\n", tid); + dout("alloc_msg unknown tid %llu\n", tid); goto out; } - m = ceph_msgpool_get(&osdc->msgpool_op_reply, front); - if (!m) { + m = __get_next_reply(con, req, front); + if (!m || IS_ERR(m)) { *skip = 1; goto out; } if (data_len > 0) { - err = prepare_pages(con, hdr, req, tid, m); + err = __prepare_pages(con, hdr, req, tid, m); if (err < 0) { *skip = 1; ceph_msg_put(m); diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h index 4162c6810a8..8d533d9406f 100644 --- a/fs/ceph/osd_client.h +++ b/fs/ceph/osd_client.h @@ -44,7 +44,7 @@ struct ceph_osd_request { struct ceph_osd *r_osd; struct ceph_pg r_pgid; - struct ceph_connection *r_con_filling_pages; + struct ceph_connection *r_con_filling_msg; struct ceph_msg *r_request, *r_reply; int r_result; @@ -75,6 +75,9 @@ struct ceph_osd_request { struct page **r_pages; /* pages for data payload */ int r_pages_from_pool; int r_own_pages; /* if true, i own page list */ + + struct ceph_msg *replies[2]; + int cur_reply; }; struct ceph_osd_client { @@ -98,8 +101,7 @@ struct ceph_osd_client { mempool_t *req_mempool; - struct ceph_msgpool msgpool_op; - struct ceph_msgpool msgpool_op_reply; + struct ceph_msgpool msgpool_op; }; extern int ceph_osdc_init(struct ceph_osd_client *osdc, -- cgit v1.2.3 From ac8839d7b264d0fa478fca7c4f9b6bb833540a80 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 27 Jan 2010 14:28:10 -0800 Subject: ceph: include type in ceph_entity_addr, filepath Include a type/version in ceph_entity_addr and filepath. Include extra byte in filepath encoding as necessary. Signed-off-by: Sage Weil --- fs/ceph/decode.h | 1 + fs/ceph/mds_client.c | 2 +- fs/ceph/messenger.c | 1 + fs/ceph/msgr.h | 5 +++-- 4 files changed, 6 insertions(+), 3 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h index 10de8489624..b90a33b948c 100644 --- a/fs/ceph/decode.h +++ b/fs/ceph/decode.h @@ -138,6 +138,7 @@ static inline void ceph_encode_filepath(void **p, void *end, { u32 len = path ? strlen(path) : 0; BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end); + ceph_encode_8(p, 1); ceph_encode_64(p, ino); ceph_encode_32(p, len); if (len) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 93998a0678c..4e3e8b229e6 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1325,7 +1325,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, } len = sizeof(*head) + - pathlen1 + pathlen2 + 2*(sizeof(u32) + sizeof(u64)); + pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)); /* calculate (max) length for cap releases */ len += sizeof(struct ceph_mds_request_release) * diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 81bc779adb9..e4e8d4439d3 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -1909,6 +1909,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr) msgr->inst.addr = *myaddr; /* select a random nonce */ + msgr->inst.addr.type = 0; get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); encode_my_addr(msgr); diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h index 40b6189aa9e..8aaab414f3f 100644 --- a/fs/ceph/msgr.h +++ b/fs/ceph/msgr.h @@ -21,7 +21,7 @@ * whenever the wire protocol changes. try to keep this string length * constant. */ -#define CEPH_BANNER "ceph v026" +#define CEPH_BANNER "ceph v027" #define CEPH_BANNER_MAX_LEN 30 @@ -61,7 +61,8 @@ extern const char *ceph_entity_type_name(int type); * entity_addr -- network address */ struct ceph_entity_addr { - __le64 nonce; /* unique id for process (e.g. pid) */ + __le32 type; + __le32 nonce; /* unique id for process (e.g. pid) */ struct sockaddr_storage in_addr; } __attribute__ ((packed)); -- cgit v1.2.3 From 9bd2e6f8ba71facf1cadb7154a7e0e4d345a6aba Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 2 Feb 2010 16:21:06 -0800 Subject: ceph: allow renewal of auth credentials Add infrastructure to allow the mon_client to periodically renew its auth credentials. Also add a messenger callback that will force such a renewal if a peer rejects our authenticator. Signed-off-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/auth.c | 61 ++++++++++++++++++++++++++++++++++++---------------- fs/ceph/auth.h | 7 ++++++ fs/ceph/mds_client.c | 13 +++++++++++ fs/ceph/messenger.c | 9 ++++++++ fs/ceph/messenger.h | 1 + fs/ceph/mon_client.c | 55 ++++++++++++++++++++++++++++++++++++++++------ fs/ceph/mon_client.h | 3 +++ fs/ceph/osd_client.c | 12 +++++++++++ fs/ceph/super.c | 12 +++++------ fs/ceph/super.h | 4 ++-- 10 files changed, 144 insertions(+), 33 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c index 32f2e2a021a..d5872d4f92b 100644 --- a/fs/ceph/auth.c +++ b/fs/ceph/auth.c @@ -125,6 +125,30 @@ bad: return -ERANGE; } +int ceph_build_auth_request(struct ceph_auth_client *ac, + void *msg_buf, size_t msg_len) +{ + struct ceph_mon_request_header *monhdr = msg_buf; + void *p = monhdr + 1; + void *end = msg_buf + msg_len; + int ret; + + monhdr->have_version = 0; + monhdr->session_mon = cpu_to_le16(-1); + monhdr->session_mon_tid = 0; + + ceph_encode_32(&p, ac->protocol); + + ret = ac->ops->build_request(ac, p + sizeof(u32), end); + if (ret < 0) { + pr_err("error %d building request\n", ret); + return ret; + } + dout(" built request %d bytes\n", ret); + ceph_encode_32(&p, ret); + return p + ret - msg_buf; +} + /* * Handle auth message from monitor. */ @@ -188,28 +212,13 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, goto out; } } + + ac->negotiating = false; } ret = ac->ops->handle_reply(ac, result, payload, payload_end); if (ret == -EAGAIN) { - struct ceph_mon_request_header *monhdr = reply_buf; - void *p = reply_buf + 1; - void *end = reply_buf + reply_len; - - monhdr->have_version = 0; - monhdr->session_mon = cpu_to_le16(-1); - monhdr->session_mon_tid = 0; - - ceph_encode_32(&p, ac->protocol); - - ret = ac->ops->build_request(ac, p + sizeof(u32), end); - if (ret < 0) { - pr_err("error %d building request\n", ret); - goto out; - } - dout(" built request %d bytes\n", ret); - ceph_encode_32(&p, ret); - return p + ret - reply_buf; + return ceph_build_auth_request(ac, reply_buf, reply_len); } else if (ret) { pr_err("authentication error %d\n", ret); return ret; @@ -222,4 +231,20 @@ out: return ret; } +int ceph_build_auth(struct ceph_auth_client *ac, + void *msg_buf, size_t msg_len) +{ + if (!ac->protocol) + return ceph_auth_build_hello(ac, msg_buf, msg_len); + BUG_ON(!ac->ops); + if (!ac->ops->is_authenticated(ac)) + return ceph_build_auth_request(ac, msg_buf, msg_len); + return 0; +} +int ceph_auth_is_authenticated(struct ceph_auth_client *ac) +{ + if (!ac->ops) + return 0; + return ac->ops->is_authenticated(ac); +} diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h index 4d8cdf6bb3b..ca4f57cfb26 100644 --- a/fs/ceph/auth.h +++ b/fs/ceph/auth.h @@ -42,6 +42,8 @@ struct ceph_auth_client_ops { struct ceph_authorizer *a, size_t len); void (*destroy_authorizer)(struct ceph_auth_client *ac, struct ceph_authorizer *a); + void (*invalidate_authorizer)(struct ceph_auth_client *ac, + int peer_type); /* reset when we (re)connect to a monitor */ void (*reset)(struct ceph_auth_client *ac); @@ -74,4 +76,9 @@ extern int ceph_handle_auth_reply(struct ceph_auth_client *ac, void *reply_buf, size_t reply_len); extern int ceph_entity_name_encode(const char *name, void **p, void *end); +extern int ceph_build_auth(struct ceph_auth_client *ac, + void *msg_buf, size_t msg_len); + +extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac); + #endif diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 4e3e8b229e6..aa8506bad42 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2946,12 +2946,25 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len) return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len); } +static int invalidate_authorizer(struct ceph_connection *con) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_auth_client *ac = mdsc->client->monc.auth; + + if (ac->ops->invalidate_authorizer) + ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); + + return ceph_monc_validate_auth(&mdsc->client->monc); +} + const static struct ceph_connection_operations mds_con_ops = { .get = con_get, .put = con_put, .dispatch = dispatch, .get_authorizer = get_authorizer, .verify_authorizer_reply = verify_authorizer_reply, + .invalidate_authorizer = invalidate_authorizer, .peer_reset = peer_reset, }; diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index e4e8d4439d3..c4341784ec8 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -1849,6 +1849,15 @@ static void ceph_fault(struct ceph_connection *con) con->in_msg = NULL; } + /* + * in case we faulted due to authentication, invalidate our + * current tickets so that we can get new ones. + */ + if (con->auth_retry && con->ops->invalidate_authorizer) { + dout("calling invalidate_authorizer()\n"); + con->ops->invalidate_authorizer(con); + } + /* If there are no messages in the queue, place the connection * in a STANDBY state (i.e., don't try to reconnect just yet). */ if (list_empty(&con->out_queue) && !con->out_keepalive_pending) { diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h index c26a3d8aa78..c9735378be3 100644 --- a/fs/ceph/messenger.h +++ b/fs/ceph/messenger.h @@ -32,6 +32,7 @@ struct ceph_connection_operations { void **buf, int *len, int *proto, void **reply_buf, int *reply_len, int force_new); int (*verify_authorizer_reply) (struct ceph_connection *con, int len); + int (*invalidate_authorizer)(struct ceph_connection *con); /* protocol version mismatch */ void (*bad_proto) (struct ceph_connection *con); diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c index 3f7ae7f73c5..fec41a0eff8 100644 --- a/fs/ceph/mon_client.c +++ b/fs/ceph/mon_client.c @@ -29,6 +29,8 @@ const static struct ceph_connection_operations mon_con_ops; +static int __validate_auth(struct ceph_mon_client *monc); + /* * Decode a monmap blob (e.g., during mount). */ @@ -103,6 +105,7 @@ static void __close_session(struct ceph_mon_client *monc) ceph_con_revoke(monc->con, monc->m_auth); ceph_con_close(monc->con); monc->cur_mon = -1; + monc->pending_auth = 0; ceph_auth_reset(monc->auth); } } @@ -334,7 +337,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, out: mutex_unlock(&monc->mutex); - wake_up(&client->mount_wq); + wake_up(&client->auth_wq); } /* @@ -477,6 +480,11 @@ static void delayed_work(struct work_struct *work) __open_session(monc); /* continue hunting */ } else { ceph_con_keepalive(monc->con); + mutex_unlock(&monc->mutex); + + __validate_auth(monc); + + mutex_lock(&monc->mutex); if (monc->auth->ops->is_authenticated(monc->auth)) __send_subscribe(monc); } @@ -557,6 +565,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) goto out_pool2; monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL); + monc->pending_auth = 0; if (IS_ERR(monc->m_auth)) { err = PTR_ERR(monc->m_auth); monc->m_auth = NULL; @@ -614,6 +623,15 @@ void ceph_monc_stop(struct ceph_mon_client *monc) kfree(monc->monmap); } +static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) +{ + monc->pending_auth = 1; + monc->m_auth->front.iov_len = len; + monc->m_auth->hdr.front_len = cpu_to_le32(len); + ceph_msg_get(monc->m_auth); /* keep our ref */ + ceph_con_send(monc->con, monc->m_auth); +} + static void handle_auth_reply(struct ceph_mon_client *monc, struct ceph_msg *msg) @@ -621,18 +639,16 @@ static void handle_auth_reply(struct ceph_mon_client *monc, int ret; mutex_lock(&monc->mutex); + monc->pending_auth = 0; ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, msg->front.iov_len, monc->m_auth->front.iov_base, monc->m_auth->front_max); if (ret < 0) { - monc->client->mount_err = ret; - wake_up(&monc->client->mount_wq); + monc->client->auth_err = ret; + wake_up(&monc->client->auth_wq); } else if (ret > 0) { - monc->m_auth->front.iov_len = ret; - monc->m_auth->hdr.front_len = cpu_to_le32(ret); - ceph_msg_get(monc->m_auth); /* keep our ref */ - ceph_con_send(monc->con, monc->m_auth); + __send_prepared_auth_request(monc, ret); } else if (monc->auth->ops->is_authenticated(monc->auth)) { dout("authenticated, starting session\n"); @@ -645,6 +661,31 @@ static void handle_auth_reply(struct ceph_mon_client *monc, mutex_unlock(&monc->mutex); } +static int __validate_auth(struct ceph_mon_client *monc) +{ + int ret; + + if (monc->pending_auth) + return 0; + + ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base, + monc->m_auth->front_max); + if (ret <= 0) + return ret; /* either an error, or no need to authenticate */ + __send_prepared_auth_request(monc, ret); + return 0; +} + +int ceph_monc_validate_auth(struct ceph_mon_client *monc) +{ + int ret; + + mutex_lock(&monc->mutex); + ret = __validate_auth(monc); + mutex_unlock(&monc->mutex); + return ret; +} + /* * handle incoming message */ diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h index c75b53302ec..5ca8e48d437 100644 --- a/fs/ceph/mon_client.h +++ b/fs/ceph/mon_client.h @@ -61,6 +61,7 @@ struct ceph_mon_client { struct ceph_auth_client *auth; struct ceph_msg *m_auth; + int pending_auth; bool hunting; int cur_mon; /* last monitor i contacted */ @@ -110,6 +111,8 @@ extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, extern int ceph_monc_open_session(struct ceph_mon_client *monc); +extern int ceph_monc_validate_auth(struct ceph_mon_client *monc); + #endif diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index 944759b3079..35c8afea13e 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -1448,6 +1448,17 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len) return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len); } +static int invalidate_authorizer(struct ceph_connection *con) +{ + struct ceph_osd *o = con->private; + struct ceph_osd_client *osdc = o->o_osdc; + struct ceph_auth_client *ac = osdc->client->monc.auth; + + if (ac->ops->invalidate_authorizer) + ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); + + return ceph_monc_validate_auth(&osdc->client->monc); +} const static struct ceph_connection_operations osd_con_ops = { .get = get_osd_con, @@ -1455,6 +1466,7 @@ const static struct ceph_connection_operations osd_con_ops = { .dispatch = dispatch, .get_authorizer = get_authorizer, .verify_authorizer_reply = verify_authorizer_reply, + .invalidate_authorizer = invalidate_authorizer, .alloc_msg = alloc_msg, .fault = osd_reset, }; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index cd81c84e96f..3a2548951fe 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -542,7 +542,7 @@ static struct ceph_client *ceph_create_client(struct ceph_mount_args *args) mutex_init(&client->mount_mutex); - init_waitqueue_head(&client->mount_wq); + init_waitqueue_head(&client->auth_wq); client->sb = NULL; client->mount_state = CEPH_MOUNT_MOUNTING; @@ -550,7 +550,7 @@ static struct ceph_client *ceph_create_client(struct ceph_mount_args *args) client->msgr = NULL; - client->mount_err = 0; + client->auth_err = 0; atomic_long_set(&client->writeback_count, 0); err = bdi_init(&client->backing_dev_info); @@ -742,13 +742,13 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, /* wait */ dout("mount waiting for mon_map\n"); - err = wait_event_interruptible_timeout(client->mount_wq, /* FIXME */ - have_mon_map(client) || (client->mount_err < 0), + err = wait_event_interruptible_timeout(client->auth_wq, + have_mon_map(client) || (client->auth_err < 0), timeout); if (err == -EINTR || err == -ERESTARTSYS) goto out; - if (client->mount_err < 0) { - err = client->mount_err; + if (client->auth_err < 0) { + err = client->auth_err; goto out; } } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 62d9ae482d7..770f7b507fc 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -123,9 +123,9 @@ struct ceph_client { struct super_block *sb; unsigned long mount_state; - wait_queue_head_t mount_wq; + wait_queue_head_t auth_wq; - int mount_err; + int auth_err; struct ceph_messenger *msgr; /* messenger instance */ struct ceph_mon_client monc; -- cgit v1.2.3 From 6c5d1a49e5e88ee831117f4b2375829933ad15da Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 13 Feb 2010 20:29:31 -0800 Subject: ceph: fix msgr to keep sent messages until acked The test was backwards from commit b3d1dbbd: keep the message if the connection _isn't_ lossy. This allows the client to continue when the TCP connection drops for some reason (network glitch) but both ends survive. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index c4341784ec8..44bdaf43992 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -463,11 +463,11 @@ static void prepare_write_message(struct ceph_connection *con) struct ceph_msg, list_head); con->out_msg = m; if (test_bit(LOSSYTX, &con->state)) { + list_del_init(&m->list_head); + } else { /* put message on sent list */ ceph_msg_get(m); list_move_tail(&m->list_head, &con->out_sent); - } else { - list_del_init(&m->list_head); } m->hdr.seq = cpu_to_le64(++con->out_seq); -- cgit v1.2.3 From e2663ab60de59d20fa33da3528f6d5359f8eb003 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 16 Feb 2010 22:01:03 -0800 Subject: ceph: allow connection to be reopened by fault callback Fix the messenger to allow a ceph_con_open() during the fault callback. Previously the work wasn't getting queued on the connection because the fault path avoids requeued work (normally spurious). Loop on reopening by checking for the OPENING state bit. This fixes OSD reconnects when a TCP connection drops. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 44bdaf43992..acf383f6a9c 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -1808,7 +1808,7 @@ done: clear_bit(BUSY, &con->state); dout("con->state=%lu\n", con->state); if (test_bit(QUEUED, &con->state)) { - if (!backoff) { + if (!backoff || test_bit(OPENING, &con->state)) { dout("con_work %p QUEUED reset, looping\n", con); goto more; } -- cgit v1.2.3 From 91e45ce38946a8efa21fefbc65d023ca3c0b434f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 15 Feb 2010 12:05:09 -0800 Subject: ceph: cancel delayed work when closing connection This ensures that if/when we reopen the connection, we can requeue work on the connection immediately, without waiting for an old timer to expire. Queue new delayed work inside con->mutex to avoid any race. This fixes problems with clients failing to reconnect to the MDS due to the client_reconnect message arriving too late (due to waiting for an old delayed work timeout to expire). Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index acf383f6a9c..ca2ad0e5bb2 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -344,6 +344,7 @@ void ceph_con_close(struct ceph_connection *con) clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */ mutex_lock(&con->mutex); reset_connection(con); + cancel_delayed_work(&con->work); mutex_unlock(&con->mutex); queue_con(con); } @@ -1841,6 +1842,8 @@ static void ceph_fault(struct ceph_connection *con) clear_bit(BUSY, &con->state); /* to avoid an improbable race */ mutex_lock(&con->mutex); + if (test_bit(CLOSED, &con->state)) + goto out_unlock; con_close_socket(con); @@ -1876,8 +1879,6 @@ static void ceph_fault(struct ceph_connection *con) else if (con->delay < MAX_DELAY_INTERVAL) con->delay *= 2; - mutex_unlock(&con->mutex); - /* explicitly schedule work to try to reconnect again later. */ dout("fault queueing %p delay %lu\n", con, con->delay); con->ops->get(con); @@ -1885,6 +1886,8 @@ static void ceph_fault(struct ceph_connection *con) round_jiffies_relative(con->delay)) == 0) con->ops->put(con); +out_unlock: + mutex_unlock(&con->mutex); out: if (con->ops->fault) con->ops->fault(con); -- cgit v1.2.3 From 5b3a4db3e4009aff918abb1353eb3f4925393a7b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 19 Feb 2010 21:43:23 -0800 Subject: ceph: fix up unexpected message handling Fix skipping of unexpected message types from osd, mon. Clean up pr_info and debug output. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 5 +++-- fs/ceph/mon_client.c | 14 +++++++++----- fs/ceph/osd_client.c | 41 +++++++++++++++++++++++++++++++---------- 3 files changed, 43 insertions(+), 17 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index ca2ad0e5bb2..fdda707aa13 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -1361,7 +1361,7 @@ static int read_partial_message(struct ceph_connection *con) con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); if (skip) { /* skip this message */ - pr_err("alloc_msg returned NULL, skipping message\n"); + dout("alloc_msg returned NULL, skipping message\n"); con->in_base_pos = -front_len - middle_len - data_len - sizeof(m->footer); con->in_tag = CEPH_MSGR_TAG_READY; @@ -1370,7 +1370,8 @@ static int read_partial_message(struct ceph_connection *con) if (IS_ERR(con->in_msg)) { ret = PTR_ERR(con->in_msg); con->in_msg = NULL; - con->error_msg = "error allocating memory for incoming message"; + con->error_msg = + "error allocating memory for incoming message"; return ret; } m = con->in_msg; diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c index 40d7d90bbed..890597c09d4 100644 --- a/fs/ceph/mon_client.c +++ b/fs/ceph/mon_client.c @@ -763,7 +763,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, struct ceph_mon_client *monc = con->private; int type = le16_to_cpu(hdr->type); int front_len = le32_to_cpu(hdr->front_len); - struct ceph_msg *m; + struct ceph_msg *m = NULL; *skip = 0; @@ -777,13 +777,17 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, case CEPH_MSG_AUTH_REPLY: m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len); break; - default: - return NULL; + case CEPH_MSG_MON_MAP: + case CEPH_MSG_MDS_MAP: + case CEPH_MSG_OSD_MAP: + m = ceph_msg_new(type, front_len, 0, 0, NULL); + break; } - if (!m) + if (!m) { + pr_info("alloc_msg unknown type %d\n", type); *skip = 1; - + } return m; } diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index fa0f7370395..ffd819c5a5d 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -1396,31 +1396,30 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) ceph_msg_put(msg); } -static struct ceph_msg *alloc_msg(struct ceph_connection *con, +/* + * lookup and return message for incoming reply + */ +static struct ceph_msg *get_reply(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip) { struct ceph_osd *osd = con->private; struct ceph_osd_client *osdc = osd->o_osdc; - int type = le16_to_cpu(hdr->type); - int front = le32_to_cpu(hdr->front_len); - int data_len = le32_to_cpu(hdr->data_len); struct ceph_msg *m; struct ceph_osd_request *req; + int front = le32_to_cpu(hdr->front_len); + int data_len = le32_to_cpu(hdr->data_len); u64 tid; int err; - *skip = 0; - if (type != CEPH_MSG_OSD_OPREPLY) - return NULL; - tid = le64_to_cpu(hdr->tid); mutex_lock(&osdc->request_mutex); req = __lookup_request(osdc, tid); if (!req) { *skip = 1; m = NULL; - dout("alloc_msg unknown tid %llu\n", tid); + pr_info("alloc_msg unknown tid %llu from osd%d\n", tid, + osd->o_osd); goto out; } m = __get_next_reply(con, req, front); @@ -1437,11 +1436,33 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, m = ERR_PTR(err); } } + *skip = 0; out: mutex_unlock(&osdc->request_mutex); - return m; + +} + +static struct ceph_msg *alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip) +{ + struct ceph_osd *osd = con->private; + int type = le16_to_cpu(hdr->type); + int front = le32_to_cpu(hdr->front_len); + + switch (type) { + case CEPH_MSG_OSD_MAP: + return ceph_msg_new(type, front, 0, 0, NULL); + case CEPH_MSG_OSD_OPREPLY: + return get_reply(con, hdr, skip); + default: + pr_info("alloc_msg unexpected msg type %d from osd%d\n", type, + osd->o_osd); + *skip = 1; + return NULL; + } } /* -- cgit v1.2.3 From 161fd65ac934608345aed35226fc889ea3b0b500 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 25 Feb 2010 12:38:57 -0800 Subject: ceph: invalidate_authorizer without con->mutex held This fixes lock ABBA inversion, as the ->invalidate_authorizer() op may need to take a lock (or even call back into the messenger). Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index fdda707aa13..9ea7b763c8d 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -1853,14 +1853,6 @@ static void ceph_fault(struct ceph_connection *con) con->in_msg = NULL; } - /* - * in case we faulted due to authentication, invalidate our - * current tickets so that we can get new ones. - */ - if (con->auth_retry && con->ops->invalidate_authorizer) { - dout("calling invalidate_authorizer()\n"); - con->ops->invalidate_authorizer(con); - } /* If there are no messages in the queue, place the connection * in a STANDBY state (i.e., don't try to reconnect just yet). */ @@ -1890,6 +1882,15 @@ static void ceph_fault(struct ceph_connection *con) out_unlock: mutex_unlock(&con->mutex); out: + /* + * in case we faulted due to authentication, invalidate our + * current tickets so that we can get new ones. + */ + if (con->auth_retry && con->ops->invalidate_authorizer) { + dout("calling invalidate_authorizer()\n"); + con->ops->invalidate_authorizer(con); + } + if (con->ops->fault) con->ops->fault(con); } -- cgit v1.2.3 From e80a52d14f868059e8ec790c9fae88cdb8a1df98 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 25 Feb 2010 12:40:45 -0800 Subject: ceph: fix connection fault STANDBY check Move any out_sent messages to out_queue _before_ checking if out_queue is empty and going to STANDBY, or else we may drop something that was never acked. And clean up the code a bit (less goto). Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 9ea7b763c8d..0ddc2c75f6b 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -1853,32 +1853,27 @@ static void ceph_fault(struct ceph_connection *con) con->in_msg = NULL; } + /* Requeue anything that hasn't been acked */ + list_splice_init(&con->out_sent, &con->out_queue); /* If there are no messages in the queue, place the connection * in a STANDBY state (i.e., don't try to reconnect just yet). */ if (list_empty(&con->out_queue) && !con->out_keepalive_pending) { dout("fault setting STANDBY\n"); set_bit(STANDBY, &con->state); - mutex_unlock(&con->mutex); - goto out; + } else { + /* retry after a delay. */ + if (con->delay == 0) + con->delay = BASE_DELAY_INTERVAL; + else if (con->delay < MAX_DELAY_INTERVAL) + con->delay *= 2; + dout("fault queueing %p delay %lu\n", con, con->delay); + con->ops->get(con); + if (queue_delayed_work(ceph_msgr_wq, &con->work, + round_jiffies_relative(con->delay)) == 0) + con->ops->put(con); } - /* Requeue anything that hasn't been acked, and retry after a - * delay. */ - list_splice_init(&con->out_sent, &con->out_queue); - - if (con->delay == 0) - con->delay = BASE_DELAY_INTERVAL; - else if (con->delay < MAX_DELAY_INTERVAL) - con->delay *= 2; - - /* explicitly schedule work to try to reconnect again later. */ - dout("fault queueing %p delay %lu\n", con, con->delay); - con->ops->get(con); - if (queue_delayed_work(ceph_msgr_wq, &con->work, - round_jiffies_relative(con->delay)) == 0) - con->ops->put(con); - out_unlock: mutex_unlock(&con->mutex); out: -- cgit v1.2.3 From 1679f876a641d209e7b22e43ebda0693c71003cf Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 26 Feb 2010 13:55:51 -0800 Subject: ceph: reset bits on connection close Clear LOSSYTX bit, so that if/when we reconnect, said reconnect will retry on failure. Clear _PENDING bits too, to avoid polluting subsequent connection state. Drop unused REGISTERED bit. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 3 +++ fs/ceph/messenger.h | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 0ddc2c75f6b..bf4590c77cf 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -342,6 +342,9 @@ void ceph_con_close(struct ceph_connection *con) dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr)); set_bit(CLOSED, &con->state); /* in case there's queued work */ clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */ + clear_bit(LOSSYTX, &con->state); /* so we retry next connect */ + clear_bit(KEEPALIVE_PENDING, &con->state); + clear_bit(WRITE_PENDING, &con->state); mutex_lock(&con->mutex); reset_connection(con); cancel_delayed_work(&con->work); diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h index c9735378be3..4caaa591111 100644 --- a/fs/ceph/messenger.h +++ b/fs/ceph/messenger.h @@ -119,7 +119,6 @@ struct ceph_msg_pos { * state with the peer. */ #define CLOSED 10 /* we've closed the connection */ #define SOCK_CLOSED 11 /* socket state changed to closed */ -#define REGISTERED 12 /* connection appears in con_tree */ #define OPENING 13 /* open connection w/ (possibly new) peer */ #define DEAD 14 /* dead, about to kfree */ -- cgit v1.2.3 From 3ca02ef96e119d36bc1752baeae7dd0c59c2f325 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 1 Mar 2010 15:25:00 -0800 Subject: ceph: reset front len on return to msgpool; BUG on mismatched front iov Reset msg front len when a message is returned to the pool: the caller may have changed it. BUG if we try to send a message with a hdr.front_len that doesn't match the front iov. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 2 ++ fs/ceph/msgpool.c | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'fs/ceph/messenger.c') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index bf4590c77cf..781656a49bf 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -1954,6 +1954,8 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) msg->hdr.src.addr = con->msgr->my_enc_addr; msg->hdr.orig_src = msg->hdr.src; + BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); + /* queue */ mutex_lock(&con->mutex); BUG_ON(!list_empty(&msg->list_head)); diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c index 2f04e0fc466..ca3b44a89f2 100644 --- a/fs/ceph/msgpool.c +++ b/fs/ceph/msgpool.c @@ -166,6 +166,10 @@ void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg) { spin_lock(&pool->lock); if (pool->num < pool->min) { + /* reset msg front_len; user may have changed it */ + msg->front.iov_len = pool->front_len; + msg->hdr.front_len = cpu_to_le32(pool->front_len); + kref_set(&msg->kref, 1); /* retake a single ref */ list_add(&msg->list_head, &pool->msgs); pool->num++; -- cgit v1.2.3