1 | /* $NetBSD: bufq_priocscan.c,v 1.20 2016/11/16 10:42:14 pgoyette Exp $ */ |
2 | |
3 | /*- |
4 | * Copyright (c)2004,2005,2006,2008,2009,2011,2012 YAMAMOTO Takashi, |
5 | * All rights reserved. |
6 | * |
7 | * Redistribution and use in source and binary forms, with or without |
8 | * modification, are permitted provided that the following conditions |
9 | * are met: |
10 | * 1. Redistributions of source code must retain the above copyright |
11 | * notice, this list of conditions and the following disclaimer. |
12 | * 2. Redistributions in binary form must reproduce the above copyright |
13 | * notice, this list of conditions and the following disclaimer in the |
14 | * documentation and/or other materials provided with the distribution. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
26 | * SUCH DAMAGE. |
27 | */ |
28 | |
29 | #include <sys/cdefs.h> |
__KERNEL_RCSID(0, "$NetBSD: bufq_priocscan.c,v 1.20 2016/11/16 10:42:14 pgoyette Exp $");
31 | |
32 | #include <sys/param.h> |
33 | #include <sys/systm.h> |
34 | #include <sys/buf.h> |
35 | #include <sys/bufq.h> |
36 | #include <sys/bufq_impl.h> |
37 | #include <sys/kmem.h> |
38 | #include <sys/rbtree.h> |
39 | #include <sys/module.h> |
40 | |
/*
 * define PRIOCSCAN_USE_GLOBAL_POSITION to make all priority queues share
 * a single head position; see the comment in struct bufq_priocscan below.
 */
#undef PRIOCSCAN_USE_GLOBAL_POSITION
42 | |
43 | /* |
44 | * Cyclical scan (CSCAN) |
45 | */ |
46 | |
47 | struct cscan_key { |
48 | daddr_t k_rawblkno; |
49 | int k_cylinder; |
50 | }; |
51 | |
52 | struct cscan_queue { |
	rb_tree_t cq_buffers; /* buffers sorted by disk position */
#if !defined(PRIOCSCAN_USE_GLOBAL_POSITION)
	struct cscan_key cq_lastkey; /* position just past the last served request */
#endif /* !defined(PRIOCSCAN_USE_GLOBAL_POSITION) */
	int cq_sortby; /* one of BUFQ_SORT_* (bq_flags & BUFQ_SORT_MASK) */
58 | rb_tree_ops_t cq_ops; |
59 | }; |
60 | |
61 | static signed int |
62 | buf_cmp(const struct buf *b1, const struct buf *b2, int sortby) |
63 | { |
64 | |
65 | if (buf_inorder(b2, b1, sortby)) { |
66 | return 1; /* b1 > b2 */ |
67 | } |
68 | if (buf_inorder(b1, b2, sortby)) { |
69 | return -1; /* b1 < b2 */ |
70 | } |
71 | return 0; |
72 | } |
73 | |
74 | /* return positive if n1 > n2 */ |
75 | static signed int |
76 | cscan_tree_compare_nodes(void *context, const void *n1, const void *n2) |
77 | { |
78 | const struct cscan_queue * const q = context; |
79 | const struct buf * const b1 = n1; |
80 | const struct buf * const b2 = n2; |
81 | const int sortby = q->cq_sortby; |
82 | const int diff = buf_cmp(b1, b2, sortby); |
83 | |
84 | /* |
85 | * XXX rawblkno/cylinder might not be unique. eg. unbuffered i/o |
86 | */ |
87 | |
88 | if (diff != 0) { |
89 | return diff; |
90 | } |
91 | |
92 | /* |
93 | * XXX rawblkno/cylinder might not be unique. eg. unbuffered i/o |
94 | */ |
95 | if (b1 > b2) { |
96 | return 1; |
97 | } |
98 | if (b1 < b2) { |
99 | return -1; |
100 | } |
101 | return 0; |
102 | } |
103 | |
104 | /* return positive if n1 > k2 */ |
105 | static signed int |
106 | cscan_tree_compare_key(void *context, const void *n1, const void *k2) |
107 | { |
108 | const struct cscan_queue * const q = context; |
109 | const struct buf * const b1 = n1; |
110 | const struct cscan_key * const key = k2; |
111 | const struct buf tmp = { |
112 | .b_rawblkno = key->k_rawblkno, |
113 | .b_cylinder = key->k_cylinder, |
114 | }; |
115 | const struct buf *b2 = &tmp; |
116 | const int sortby = q->cq_sortby; |
117 | |
118 | return buf_cmp(b1, b2, sortby); |
119 | } |
120 | |
121 | static void __unused |
122 | cscan_dump(struct cscan_queue *cq) |
123 | { |
124 | const int sortby = cq->cq_sortby; |
125 | struct buf *bp; |
126 | |
127 | RB_TREE_FOREACH(bp, &cq->cq_buffers) { |
128 | if (sortby == BUFQ_SORT_RAWBLOCK) { |
129 | printf(" %jd" , (intmax_t)bp->b_rawblkno); |
130 | } else { |
131 | printf(" %jd/%jd" , |
132 | (intmax_t)bp->b_cylinder, (intmax_t)bp->b_rawblkno); |
133 | } |
134 | } |
135 | } |
136 | |
137 | static inline bool |
138 | cscan_empty(struct cscan_queue *q) |
139 | { |
140 | |
141 | /* XXX this might do more work than necessary */ |
142 | return rb_tree_iterate(&q->cq_buffers, NULL, RB_DIR_LEFT) == NULL; |
143 | } |
144 | |
145 | static void |
146 | cscan_put(struct cscan_queue *q, struct buf *bp) |
147 | { |
148 | struct buf *obp __diagused; |
149 | |
150 | obp = rb_tree_insert_node(&q->cq_buffers, bp); |
151 | KASSERT(obp == bp); /* see cscan_tree_compare_nodes */ |
152 | } |
153 | |
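/*
 * cscan_get: return the first buffer at or after *key, wrapping around
 * to the lowest-positioned buffer when nothing lies in that direction.
 * e.g. with queued rawblknos {10, 20, 30} and a head position of 25,
 * the first call returns 30 and the next one wraps around to 10.  if
 * "remove" is set, also take the buffer off the queue and advance *key
 * just past it.
 */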
154 | static struct buf * |
155 | cscan_get(struct cscan_queue *q, int remove, struct cscan_key *key) |
156 | { |
157 | struct buf *bp; |
158 | |
159 | bp = rb_tree_find_node_geq(&q->cq_buffers, key); |
160 | KDASSERT(bp == NULL || cscan_tree_compare_key(q, bp, key) >= 0); |
	if (bp == NULL) {
		/* nothing at or beyond the head position; wrap around. */
		bp = rb_tree_iterate(&q->cq_buffers, NULL, RB_DIR_LEFT);
		KDASSERT(cscan_tree_compare_key(q, bp, key) < 0);
	}
165 | if (bp != NULL && remove) { |
166 | #if defined(DEBUG) |
167 | struct buf *nbp; |
168 | #endif /* defined(DEBUG) */ |
169 | |
170 | rb_tree_remove_node(&q->cq_buffers, bp); |
171 | /* |
172 | * remember the head position. |
173 | */ |
174 | key->k_cylinder = bp->b_cylinder; |
175 | key->k_rawblkno = bp->b_rawblkno + (bp->b_bcount >> DEV_BSHIFT); |
176 | #if defined(DEBUG) |
177 | nbp = rb_tree_find_node_geq(&q->cq_buffers, key); |
178 | if (nbp != NULL && cscan_tree_compare_nodes(q, nbp, bp) < 0) { |
179 | panic("%s: wrong order %p < %p\n" , __func__, |
180 | nbp, bp); |
181 | } |
182 | #endif /* defined(DEBUG) */ |
183 | } |
184 | return bp; |
185 | } |
186 | |
187 | static void |
188 | cscan_init(struct cscan_queue *q, int sortby) |
189 | { |
190 | static const rb_tree_ops_t cscan_tree_ops = { |
191 | .rbto_compare_nodes = cscan_tree_compare_nodes, |
192 | .rbto_compare_key = cscan_tree_compare_key, |
193 | .rbto_node_offset = offsetof(struct buf, b_u.u_rbnode), |
194 | .rbto_context = NULL, |
195 | }; |
196 | |
197 | q->cq_sortby = sortby; |
	/* XXX copy the ops to work around an rbtree.h API limitation */
199 | q->cq_ops = cscan_tree_ops; |
200 | q->cq_ops.rbto_context = q; |
201 | rb_tree_init(&q->cq_buffers, &q->cq_ops); |
202 | } |
203 | |
204 | /* |
205 | * Per-prioritiy CSCAN. |
206 | * |
207 | * XXX probably we should have a way to raise |
208 | * priority of the on-queue requests. |
209 | */ |
210 | #define PRIOCSCAN_NQUEUE 3 |
211 | |
212 | struct priocscan_queue { |
	struct cscan_queue q_queue;
	unsigned int q_burst;	/* requests remaining in the current burst */
215 | }; |
216 | |
217 | struct bufq_priocscan { |
218 | struct priocscan_queue bq_queue[PRIOCSCAN_NQUEUE]; |
219 | |
220 | #if defined(PRIOCSCAN_USE_GLOBAL_POSITION) |
221 | /* |
222 | * XXX using "global" head position can reduce positioning time |
223 | * when switching between queues. |
224 | * although it might affect against fairness. |
225 | */ |
226 | struct cscan_key bq_lastkey; |
227 | #endif |
228 | }; |
229 | |
230 | /* |
231 | * how many requests to serve when having pending requests on other queues. |
232 | * |
233 | * XXX tune |
234 | * be careful: while making these values larger likely |
235 | * increases the total throughput, it can also increase latencies |
236 | * for some workloads. |
237 | */ |
238 | const int priocscan_burst[] = { |
239 | 64, 16, 4 |
240 | }; |
241 | |
242 | static void bufq_priocscan_init(struct bufq_state *); |
243 | static void bufq_priocscan_put(struct bufq_state *, struct buf *); |
244 | static struct buf *bufq_priocscan_get(struct bufq_state *, int); |
245 | |
246 | BUFQ_DEFINE(priocscan, 40, bufq_priocscan_init); |
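
/*
 * a rough usage sketch (not from this file; driver details vary): a
 * disk driver selects this strategy by name and then moves buffers
 * through the generic bufq interface, e.g.
 *
 *	struct bufq_state *bufq;
 *	struct buf *bp;
 *
 *	bufq_alloc(&bufq, "priocscan", BUFQ_SORT_RAWBLOCK);
 *	...
 *	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
 *	bufq_put(bufq, bp);
 *	...
 *	while ((bp = bufq_get(bufq)) != NULL)
 *		... start the i/o described by bp ...
 */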
247 | |
248 | static inline struct cscan_queue *bufq_priocscan_selectqueue( |
249 | struct bufq_priocscan *, const struct buf *); |
250 | |
251 | static inline struct cscan_queue * |
252 | bufq_priocscan_selectqueue(struct bufq_priocscan *q, const struct buf *bp) |
253 | { |
	static const int priocscan_priomap[] = {
		[BPRIO_TIMENONCRITICAL] = 2,
		[BPRIO_TIMELIMITED] = 1,
		[BPRIO_TIMECRITICAL] = 0	/* lower index = higher priority */
	};
259 | |
260 | return &q->bq_queue[priocscan_priomap[BIO_GETPRIO(bp)]].q_queue; |
261 | } |
262 | |
263 | static void |
264 | bufq_priocscan_put(struct bufq_state *bufq, struct buf *bp) |
265 | { |
266 | struct bufq_priocscan *q = bufq->bq_private; |
267 | struct cscan_queue *cq; |
268 | |
269 | cq = bufq_priocscan_selectqueue(q, bp); |
270 | cscan_put(cq, bp); |
271 | } |
272 | |
273 | static struct buf * |
274 | bufq_priocscan_get(struct bufq_state *bufq, int remove) |
275 | { |
276 | struct bufq_priocscan *q = bufq->bq_private; |
277 | struct priocscan_queue *pq, *npq; |
278 | struct priocscan_queue *first; /* highest priority non-empty queue */ |
279 | const struct priocscan_queue *epq; |
280 | struct buf *bp; |
281 | bool single; /* true if there's only one non-empty queue */ |
282 | |
283 | /* |
284 | * find the highest priority non-empty queue. |
285 | */ |
286 | pq = &q->bq_queue[0]; |
287 | epq = pq + PRIOCSCAN_NQUEUE; |
288 | for (; pq < epq; pq++) { |
289 | if (!cscan_empty(&pq->q_queue)) { |
290 | break; |
291 | } |
292 | } |
293 | if (pq == epq) { |
294 | /* |
295 | * all our queues are empty. there's nothing to serve. |
296 | */ |
297 | return NULL; |
298 | } |
299 | first = pq; |
300 | |
301 | /* |
302 | * scan the rest of queues. |
303 | * |
304 | * if we have two or more non-empty queues, we serve the highest |
305 | * priority one with non-zero burst count. |
306 | */ |
307 | single = true; |
308 | for (npq = pq + 1; npq < epq; npq++) { |
309 | if (!cscan_empty(&npq->q_queue)) { |
310 | /* |
311 | * we found another non-empty queue. |
312 | * it means that a queue needs to consume its burst |
313 | * count to be served. |
314 | */ |
315 | single = false; |
316 | |
317 | /* |
318 | * check if our current candidate queue has already |
319 | * exhausted its burst count. |
320 | */ |
321 | if (pq->q_burst > 0) { |
322 | break; |
323 | } |
324 | pq = npq; |
325 | } |
326 | } |
327 | if (single) { |
328 | /* |
		 * there's only one non-empty queue.
330 | * just serve it without consuming its burst count. |
331 | */ |
332 | KASSERT(pq == first); |
333 | } else { |
334 | /* |
335 | * there are two or more non-empty queues. |
336 | */ |
337 | if (pq->q_burst == 0) { |
338 | /* |
339 | * no queues can be served because they have already |
340 | * exhausted their burst count. |
341 | */ |
342 | unsigned int i; |
343 | #ifdef DEBUG |
344 | for (i = 0; i < PRIOCSCAN_NQUEUE; i++) { |
345 | pq = &q->bq_queue[i]; |
346 | if (!cscan_empty(&pq->q_queue) && pq->q_burst) { |
347 | panic("%s: inconsist" , __func__); |
348 | } |
349 | } |
350 | #endif /* DEBUG */ |
351 | /* |
352 | * reset burst counts. |
353 | */ |
354 | if (remove) { |
355 | for (i = 0; i < PRIOCSCAN_NQUEUE; i++) { |
356 | pq = &q->bq_queue[i]; |
357 | pq->q_burst = priocscan_burst[i]; |
358 | } |
359 | } |
360 | |
361 | /* |
362 | * serve the highest priority non-empty queue. |
363 | */ |
364 | pq = first; |
365 | } |
366 | /* |
367 | * consume the burst count. |
368 | * |
369 | * XXX account only by number of requests. is it good enough? |
370 | */ |
371 | if (remove) { |
372 | KASSERT(pq->q_burst > 0); |
373 | pq->q_burst--; |
374 | } |
375 | } |
376 | |
377 | /* |
378 | * finally, get a request from the selected queue. |
379 | */ |
380 | KDASSERT(!cscan_empty(&pq->q_queue)); |
381 | bp = cscan_get(&pq->q_queue, remove, |
382 | #if defined(PRIOCSCAN_USE_GLOBAL_POSITION) |
383 | &q->bq_lastkey |
384 | #else /* defined(PRIOCSCAN_USE_GLOBAL_POSITION) */ |
385 | &pq->q_queue.cq_lastkey |
386 | #endif /* defined(PRIOCSCAN_USE_GLOBAL_POSITION) */ |
387 | ); |
388 | KDASSERT(bp != NULL); |
389 | KDASSERT(&pq->q_queue == bufq_priocscan_selectqueue(q, bp)); |
390 | |
391 | return bp; |
392 | } |
393 | |
394 | static struct buf * |
395 | bufq_priocscan_cancel(struct bufq_state *bufq, struct buf *bp) |
396 | { |
397 | struct bufq_priocscan * const q = bufq->bq_private; |
398 | unsigned int i; |
399 | |
400 | for (i = 0; i < PRIOCSCAN_NQUEUE; i++) { |
401 | struct cscan_queue * const cq = &q->bq_queue[i].q_queue; |
402 | struct buf *it; |
403 | |
404 | /* |
405 | * XXX probably could be faster but the cancel functionality |
406 | * is not widely used anyway. |
407 | */ |
408 | RB_TREE_FOREACH(it, &cq->cq_buffers) { |
409 | if (it == bp) { |
410 | rb_tree_remove_node(&cq->cq_buffers, bp); |
411 | return bp; |
412 | } |
413 | } |
414 | } |
415 | return NULL; |
416 | } |
417 | |
418 | static void |
419 | bufq_priocscan_fini(struct bufq_state *bufq) |
420 | { |
421 | |
422 | KASSERT(bufq->bq_private != NULL); |
423 | kmem_free(bufq->bq_private, sizeof(struct bufq_priocscan)); |
424 | } |
425 | |
426 | static void |
427 | bufq_priocscan_init(struct bufq_state *bufq) |
428 | { |
429 | struct bufq_priocscan *q; |
430 | const int sortby = bufq->bq_flags & BUFQ_SORT_MASK; |
431 | unsigned int i; |
432 | |
433 | bufq->bq_get = bufq_priocscan_get; |
434 | bufq->bq_put = bufq_priocscan_put; |
435 | bufq->bq_cancel = bufq_priocscan_cancel; |
436 | bufq->bq_fini = bufq_priocscan_fini; |
437 | bufq->bq_private = kmem_zalloc(sizeof(struct bufq_priocscan), KM_SLEEP); |
438 | |
439 | q = bufq->bq_private; |
440 | for (i = 0; i < PRIOCSCAN_NQUEUE; i++) { |
441 | struct cscan_queue *cq = &q->bq_queue[i].q_queue; |
442 | |
443 | cscan_init(cq, sortby); |
444 | } |
445 | } |
446 | |
447 | MODULE(MODULE_CLASS_BUFQ, bufq_priocscan, NULL); |
448 | |
449 | static int |
450 | bufq_priocscan_modcmd(modcmd_t cmd, void *opaque) |
451 | { |
452 | |
453 | switch (cmd) { |
454 | case MODULE_CMD_INIT: |
455 | return bufq_register(&bufq_strat_priocscan); |
456 | case MODULE_CMD_FINI: |
457 | return bufq_unregister(&bufq_strat_priocscan); |
458 | default: |
459 | return ENOTTY; |
460 | } |
461 | } |
462 | |