epoll后端机制的实现代码在epoll.c文件中。
1 /* 2 * Copyright 2000-2007 Niels Provos3 * Copyright 2007-2012 Niels Provos, Nick Mathewson 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. The name of the author may not be used to endorse or promote products 14 * derived from this software without specific prior written permission. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 */ 27 #include "event2/event-config.h" 28 #include "evconfig-private.h" 29 30 #ifdef EVENT__HAVE_EPOLL 31 32 #include 33 #include 34 #include 35 #ifdef EVENT__HAVE_SYS_TIME_H 36 #include 37 #endif 38 #include 39 #include 40 #include 41 #include 42 #include 43 #include 44 #include 45 #include 46 #include 47 #ifdef EVENT__HAVE_FCNTL_H 48 #include 49 #endif 50 #ifdef EVENT__HAVE_SYS_TIMERFD_H 51 #include 52 #endif 53 54 #include "event-internal.h" 55 #include "evsignal-internal.h" 56 #include "event2/thread.h" 57 #include "evthread-internal.h" 58 #include "log-internal.h" 59 #include "evmap-internal.h" 60 #include "changelist-internal.h" 61 #include "time-internal.h" 62 63 /* Since Linux 2.6.17, epoll is able to report about peer half-closed connection 64 using special EPOLLRDHUP flag on a read event. 65 */ 66 #if !defined(EPOLLRDHUP) 67 #define EPOLLRDHUP 0 68 #define EARLY_CLOSE_IF_HAVE_RDHUP 0 69 #else 70 #define EARLY_CLOSE_IF_HAVE_RDHUP EV_FEATURE_EARLY_CLOSE 71 #endif 72 73 #include "epolltable-internal.h" 74 75 #if defined(EVENT__HAVE_SYS_TIMERFD_H) && \ 76 defined(EVENT__HAVE_TIMERFD_CREATE) && \ 77 defined(HAVE_POSIX_MONOTONIC) && defined(TFD_NONBLOCK) && \ 78 defined(TFD_CLOEXEC) 79 /* Note that we only use timerfd if TFD_NONBLOCK and TFD_CLOEXEC are available 80 and working. This means that we can't support it on 2.6.25 (where timerfd 81 was introduced) or 2.6.26, since 2.6.27 introduced those flags. 
82 */ 83 #define USING_TIMERFD 84 #endif 85 86 struct epollop { 87 struct epoll_event *events; 88 int nevents; 89 int epfd; 90 #ifdef USING_TIMERFD 91 int timerfd; 92 #endif 93 }; 94 95 static void *epoll_init(struct event_base *); 96 static int epoll_dispatch(struct event_base *, struct timeval *); 97 static void epoll_dealloc(struct event_base *); 98 99 static const struct eventop epollops_changelist = {100 "epoll (with changelist)",101 epoll_init,102 event_changelist_add_,103 event_changelist_del_,104 epoll_dispatch,105 epoll_dealloc,106 1, /* need reinit */107 EV_FEATURE_ET|EV_FEATURE_O1| EARLY_CLOSE_IF_HAVE_RDHUP,108 EVENT_CHANGELIST_FDINFO_SIZE109 };110 111 112 static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,113 short old, short events, void *p);114 static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,115 short old, short events, void *p);116 117 const struct eventop epollops = {118 "epoll",119 epoll_init,120 epoll_nochangelist_add,121 epoll_nochangelist_del,122 epoll_dispatch,123 epoll_dealloc,124 1, /* need reinit */125 EV_FEATURE_ET|EV_FEATURE_O1|EV_FEATURE_EARLY_CLOSE,126 0127 };128 129 #define INITIAL_NEVENT 32130 #define MAX_NEVENT 4096131 132 /* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout133 * values bigger than (LONG_MAX - 999ULL)/HZ. HZ in the wild can be134 * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the135 * largest number of msec we can support here is 2147482. Let's136 * round that down by 47 seconds.137 */138 #define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000)139 140 static void *141 epoll_init(struct event_base *base)142 {143 int epfd = -1;144 struct epollop *epollop;145 146 #ifdef EVENT__HAVE_EPOLL_CREATE1147 /* First, try the shiny new epoll_create1 interface, if we have it. */148 epfd = epoll_create1(EPOLL_CLOEXEC);149 #endif150 if (epfd == -1) {151 /* Initialize the kernel queue using the old interface. (The152 size field is ignored since 2.6.8.) 
*/153 if ((epfd = epoll_create(32000)) == -1) {154 if (errno != ENOSYS)155 event_warn("epoll_create");156 return (NULL);157 }158 evutil_make_socket_closeonexec(epfd);159 }160 161 if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) {162 close(epfd);163 return (NULL);164 }165 166 epollop->epfd = epfd;167 168 /* Initialize fields */169 epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event));170 if (epollop->events == NULL) {171 mm_free(epollop);172 close(epfd);173 return (NULL);174 }175 epollop->nevents = INITIAL_NEVENT;176 177 if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 ||178 ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 &&179 evutil_getenv_("EVENT_EPOLL_USE_CHANGELIST") != NULL)) {180 181 base->evsel = &epollops_changelist;182 }183 184 #ifdef USING_TIMERFD185 /*186 The epoll interface ordinarily gives us one-millisecond precision,187 so on Linux it makes perfect sense to use the CLOCK_MONOTONIC_COARSE188 timer. But when the user has set the new PRECISE_TIMER flag for an189 event_base, we can try to use timerfd to give them finer granularity.190 */191 if ((base->flags & EVENT_BASE_FLAG_PRECISE_TIMER) &&192 base->monotonic_timer.monotonic_clock == CLOCK_MONOTONIC) {193 int fd;194 fd = epollop->timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);195 if (epollop->timerfd >= 0) {196 struct epoll_event epev;197 memset(&epev, 0, sizeof(epev));198 epev.data.fd = epollop->timerfd;199 epev.events = EPOLLIN;200 if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, fd, &epev) < 0) {201 event_warn("epoll_ctl(timerfd)");202 close(fd);203 epollop->timerfd = -1;204 }205 } else {206 if (errno != EINVAL && errno != ENOSYS) {207 /* These errors probably mean that we were208 * compiled with timerfd/TFD_* support, but209 * we're running on a kernel that lacks those.210 */211 event_warn("timerfd_create");212 }213 epollop->timerfd = -1;214 }215 } else {216 epollop->timerfd = -1;217 }218 #endif219 220 evsig_init_(base);221 222 return 
(epollop);223 }224 225 static const char *226 change_to_string(int change)227 {228 change &= (EV_CHANGE_ADD|EV_CHANGE_DEL);229 if (change == EV_CHANGE_ADD) {230 return "add";231 } else if (change == EV_CHANGE_DEL) {232 return "del";233 } else if (change == 0) {234 return "none";235 } else {236 return "???";237 }238 }239 240 static const char *241 epoll_op_to_string(int op)242 {243 return op == EPOLL_CTL_ADD?"ADD":244 op == EPOLL_CTL_DEL?"DEL":245 op == EPOLL_CTL_MOD?"MOD":246 "???";247 }248 249 #define PRINT_CHANGES(op, events, ch, status) \250 "Epoll %s(%d) on fd %d " status ". " \251 "Old events were %d; " \252 "read change was %d (%s); " \253 "write change was %d (%s); " \254 "close change was %d (%s)", \255 epoll_op_to_string(op), \256 events, \257 ch->fd, \258 ch->old_events, \259 ch->read_change, \260 change_to_string(ch->read_change), \261 ch->write_change, \262 change_to_string(ch->write_change), \263 ch->close_change, \264 change_to_string(ch->close_change)265 266 static int267 epoll_apply_one_change(struct event_base *base,268 struct epollop *epollop,269 const struct event_change *ch)270 {271 struct epoll_event epev;272 int op, events = 0;273 int idx;274 275 idx = EPOLL_OP_TABLE_INDEX(ch);276 op = epoll_op_table[idx].op;277 events = epoll_op_table[idx].events;278 279 if (!events) {280 EVUTIL_ASSERT(op == 0);281 return 0;282 }283 284 if ((ch->read_change|ch->write_change) & EV_CHANGE_ET)285 events |= EPOLLET;286 287 memset(&epev, 0, sizeof(epev));288 epev.data.fd = ch->fd;289 epev.events = events;290 if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == 0) {291 event_debug((PRINT_CHANGES(op, epev.events, ch, "okay")));292 return 0;293 }294 295 switch (op) {296 case EPOLL_CTL_MOD:297 if (errno == ENOENT) {298 /* If a MOD operation fails with ENOENT, the299 * fd was probably closed and re-opened. 
We300 * should retry the operation as an ADD.301 */302 if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) {303 event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too",304 (int)epev.events, ch->fd);305 return -1;306 } else {307 event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.",308 (int)epev.events,309 ch->fd));310 return 0;311 }312 }313 break;314 case EPOLL_CTL_ADD:315 if (errno == EEXIST) {316 /* If an ADD operation fails with EEXIST,317 * either the operation was redundant (as with a318 * precautionary add), or we ran into a fun319 * kernel bug where using dup*() to duplicate the320 * same file into the same fd gives you the same epitem321 * rather than a fresh one. For the second case,322 * we must retry with MOD. */323 if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {324 event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",325 (int)epev.events, ch->fd);326 return -1;327 } else {328 event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",329 (int)epev.events,330 ch->fd));331 return 0;332 }333 }334 break;335 case EPOLL_CTL_DEL:336 if (errno == ENOENT || errno == EBADF || errno == EPERM) {337 /* If a delete fails with one of these errors,338 * that's fine too: we closed the fd before we339 * got around to calling epoll_dispatch. 
*/340 event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.",341 (int)epev.events,342 ch->fd,343 strerror(errno)));344 return 0;345 }346 break;347 default:348 break;349 }350 351 event_warn(PRINT_CHANGES(op, epev.events, ch, "failed"));352 return -1;353 }354 355 static int356 epoll_apply_changes(struct event_base *base)357 {358 struct event_changelist *changelist = &base->changelist;359 struct epollop *epollop = base->evbase;360 struct event_change *ch;361 362 int r = 0;363 int i;364 365 for (i = 0; i < changelist->n_changes; ++i) {366 ch = &changelist->changes[i];367 if (epoll_apply_one_change(base, epollop, ch) < 0)368 r = -1;369 }370 371 return (r);372 }373 374 static int375 epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,376 short old, short events, void *p)377 {378 struct event_change ch;379 ch.fd = fd;380 ch.old_events = old;381 ch.read_change = ch.write_change = ch.close_change = 0;382 if (events & EV_WRITE)383 ch.write_change = EV_CHANGE_ADD |384 (events & EV_ET);385 if (events & EV_READ)386 ch.read_change = EV_CHANGE_ADD |387 (events & EV_ET);388 if (events & EV_CLOSED)389 ch.close_change = EV_CHANGE_ADD |390 (events & EV_ET);391 392 return epoll_apply_one_change(base, base->evbase, &ch);393 }394 395 static int396 epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,397 short old, short events, void *p)398 {399 struct event_change ch;400 ch.fd = fd;401 ch.old_events = old;402 ch.read_change = ch.write_change = ch.close_change = 0;403 if (events & EV_WRITE)404 ch.write_change = EV_CHANGE_DEL;405 if (events & EV_READ)406 ch.read_change = EV_CHANGE_DEL;407 if (events & EV_CLOSED)408 ch.close_change = EV_CHANGE_DEL;409 410 return epoll_apply_one_change(base, base->evbase, &ch);411 }412 413 static int414 epoll_dispatch(struct event_base *base, struct timeval *tv)415 {416 struct epollop *epollop = base->evbase;417 struct epoll_event *events = epollop->events;418 int i, res;419 long timeout = -1;420 421 #ifdef 
USING_TIMERFD422 if (epollop->timerfd >= 0) {423 struct itimerspec is;424 is.it_interval.tv_sec = 0;425 is.it_interval.tv_nsec = 0;426 if (tv == NULL) {427 /* No timeout; disarm the timer. */428 is.it_value.tv_sec = 0;429 is.it_value.tv_nsec = 0;430 } else {431 if (tv->tv_sec == 0 && tv->tv_usec == 0) {432 /* we need to exit immediately; timerfd can't433 * do that. */434 timeout = 0;435 }436 is.it_value.tv_sec = tv->tv_sec;437 is.it_value.tv_nsec = tv->tv_usec * 1000;438 }439 /* TODO: we could avoid unnecessary syscalls here by only440 calling timerfd_settime when the top timeout changes, or441 when we're called with a different timeval.442 */443 if (timerfd_settime(epollop->timerfd, 0, &is, NULL) < 0) {444 event_warn("timerfd_settime");445 }446 } else447 #endif448 if (tv != NULL) {449 timeout = evutil_tv_to_msec_(tv);450 if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) {451 /* Linux kernels can wait forever if the timeout is452 * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */453 timeout = MAX_EPOLL_TIMEOUT_MSEC;454 }455 }456 457 epoll_apply_changes(base);458 event_changelist_remove_all_(&base->changelist, base);459 460 EVBASE_RELEASE_LOCK(base, th_base_lock);461 462 res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);463 464 EVBASE_ACQUIRE_LOCK(base, th_base_lock);465 466 if (res == -1) {467 if (errno != EINTR) {468 event_warn("epoll_wait");469 return (-1);470 }471 472 return (0);473 }474 475 event_debug(("%s: epoll_wait reports %d", __func__, res));476 EVUTIL_ASSERT(res <= epollop->nevents);477 478 for (i = 0; i < res; i++) {479 int what = events[i].events;480 short ev = 0;481 #ifdef USING_TIMERFD482 if (events[i].data.fd == epollop->timerfd)483 continue;484 #endif485 486 if (what & (EPOLLHUP|EPOLLERR)) {487 ev = EV_READ | EV_WRITE;488 } else {489 if (what & EPOLLIN)490 ev |= EV_READ;491 if (what & EPOLLOUT)492 ev |= EV_WRITE;493 if (what & EPOLLRDHUP)494 ev |= EV_CLOSED;495 }496 497 if (!ev)498 continue;499 500 evmap_io_active_(base, 
events[i].data.fd, ev | EV_ET);501 }502 503 if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) {504 /* We used all of the event space this time. We should505 be ready for more events next time. */506 int new_nevents = epollop->nevents * 2;507 struct epoll_event *new_events;508 509 new_events = mm_realloc(epollop->events,510 new_nevents * sizeof(struct epoll_event));511 if (new_events) {512 epollop->events = new_events;513 epollop->nevents = new_nevents;514 }515 }516 517 return (0);518 }519 520 521 static void522 epoll_dealloc(struct event_base *base)523 {524 struct epollop *epollop = base->evbase;525 526 evsig_dealloc_(base);527 if (epollop->events)528 mm_free(epollop->events);529 if (epollop->epfd >= 0)530 close(epollop->epfd);531 #ifdef USING_TIMERFD532 if (epollop->timerfd >= 0)533 close(epollop->timerfd);534 #endif535 536 memset(epollop, 0, sizeof(struct epollop));537 mm_free(epollop);538 }539 540 #endif /* EVENT__HAVE_EPOLL */
(1)第117行-127行定义的epollops对应了这篇里说的epoll后端机制的定义。
(2)该文件中定义了epoll_init函数用于初始化、epoll_nochangelist_add函数用于添加一个事件、epoll_nochangelist_del函数用于删除一个事件、epoll_dispatch函数用于事件循环、epoll_dealloc函数用于释放资源。
1、epoll_init函数
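1)创建epfd:如果定义了EVENT__HAVE_EPOLL_CREATE1,先尝试调用epoll_create1(EPOLL_CLOEXEC);若没有得到有效的epfd,再退回调用epoll_create,并用evutil_make_socket_closeonexec设置close-on-exec标志。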
2)在堆上分配一个struct epollop结构epollop。
3)把epollop的成员epfd赋值为刚才创建的epfd。
4)初始化成员events,调用mm_calloc函数分配INITIAL_NEVENT个struct epoll_event。
5)初始化成员nevents为INITIAL_NEVENT。
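6)如果定义了USING_TIMERFD宏,就初始化成员timerfd:只有当event_base设置了EVENT_BASE_FLAG_PRECISE_TIMER标志且单调时钟为CLOCK_MONOTONIC时,才调用timerfd_create创建timerfd并通过epoll_ctl加入epfd;否则(或创建失败时)timerfd被置为-1。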
7)调用evsig_init_函数。
8)返回epollop。
2、epoll_nochangelist_add函数
1)判断read、write、close是否有改变。
2)调用epoll_apply_one_change函数,在该函数中首先调用epoll_ctl修改事件,然后处理各种异常情况,比如:MOD操作返回ENOENT时改用ADD重试、ADD操作返回EEXIST时改用MOD重试,DEL操作返回ENOENT、EBADF或EPERM时视为成功。
3、epoll_nochangelist_del函数
1)判断read、write、close是否有删除。
2)调用函数epoll_apply_one_change函数。
4、epoll_dispatch函数
1)通过event_base结构的evbase获取epollop指针,然后取出初始化时分配的events数组指针并保存在局部变量events中。
2)获取timeout。
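3)先调用epoll_apply_changes应用changelist中积累的修改,并调用event_changelist_remove_all_清空changelist;然后释放th_base_lock锁,调用epoll_wait函数,返回后重新获取该锁。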
4)在一个for循环中处理激活事件,在每一次循环中,先把epoll事件转换为libevent定义的事件:如果出现EPOLLHUP或EPOLLERR,则转换为EV_READ|EV_WRITE;否则EPOLLIN->EV_READ,EPOLLOUT->EV_WRITE,EPOLLRDHUP->EV_CLOSED。然后调用evmap_io_active_函数激活该fd上对应的事件。
5)判断如果本次用完了所有事件空间(res等于nevents),并且nevents还小于MAX_NEVENT(4096),则为下一次准备更多的事件空间,调用mm_realloc扩展为原来的2倍;初始大小INITIAL_NEVENT为32。