===================
Block io priorities
===================


Intro
-----

With the introduction of cfq v3 (aka cfq-ts or time sliced cfq), basic io
priorities are supported for reads on files. This enables users to io nice
processes or process groups, similar to what has been possible with cpu
scheduling for ages. This document mainly details the current possibilities
with cfq; other io schedulers do not support io priorities thus far.

Scheduling classes
------------------

CFQ implements three generic scheduling classes that determine how io is
served for a process.

IOPRIO_CLASS_RT: This is the realtime io class. This scheduling class is given
higher priority than any other in the system; processes from this class are
given first access to the disk every time. Thus it needs to be used with some
care: one io RT process can starve the entire system. Within the RT class,
there are 8 levels of class data that determine exactly how much time this
process needs the disk for on each service. In the future this might change
to be more directly mappable to performance, by passing in a wanted data
rate instead.

IOPRIO_CLASS_BE: This is the best-effort scheduling class, which is the default
for any process that hasn't set a specific io priority.
The class data
determines how much io bandwidth the process will get; it is directly mappable
to the cpu nice levels, just more coarsely implemented. 0 is the highest
BE prio level, 7 is the lowest. The mapping between cpu nice level and io
nice level is determined as: io_nice = (cpu_nice + 20) / 5.

IOPRIO_CLASS_IDLE: This is the idle scheduling class, processes running at this
level only get io time when no one else needs the disk. The idle class has no
class data, since it doesn't really apply here.

Tools
-----

See below for a sample ionice tool. Usage::

    # ionice -c<class> -n<level> -p<pid>

If pid isn't given, the current process is assumed. IO priority settings
are inherited on fork, so you can use ionice to start the process at a given
level::

    # ionice -c2 -n0 /bin/ls

will run ls at the best-effort scheduling class at the highest priority.
For a running process, you can give the pid instead::

    # ionice -c1 -n2 -p100

will change pid 100 to run at the realtime scheduling class, at priority 2.

// ionice.c tool::

    /*
     * ionice - get or set the io priority of a process, or start a command
     * with a given io priority.
     *
     *   ionice [-c class] [-n level] [-p pid] [command [args...]]
     *
     * With neither -c nor -n the current priority of the target is printed;
     * the target is -p pid, else the first positional argument, else the
     * calling process (pid 0). With -c and/or -n the priority is set on the
     * target and, if a command is given, the command is then executed.
     */

    /* syscall() is only declared by <unistd.h> when a feature-test macro
     * enables it; this must come before the first #include. */
    #define _GNU_SOURCE

    #include <stdio.h>
    #include <stdlib.h>
    #include <errno.h>
    #include <limits.h>
    #include <getopt.h>
    #include <unistd.h>
    #include <sys/ptrace.h>
    #include <sys/syscall.h>
    #include <asm/unistd.h>

    /*
     * Take the syscall numbers from the C library headers instead of a
     * hand-maintained per-architecture table, so the tool builds on every
     * architecture whose headers know these syscalls.
     */
    #if !defined(SYS_ioprio_set) || !defined(SYS_ioprio_get)
    #error "ioprio_set/ioprio_get syscall numbers are not provided by <sys/syscall.h>"
    #endif

    /* Invoke the ioprio_set syscall; returns -1 with errno set on failure. */
    static inline int ioprio_set(int which, int who, int ioprio)
    {
        return (int)syscall(SYS_ioprio_set, which, who, ioprio);
    }

    /* Invoke the ioprio_get syscall; returns -1 with errno set on failure. */
    static inline int ioprio_get(int which, int who)
    {
        return (int)syscall(SYS_ioprio_get, which, who);
    }

    enum {
        IOPRIO_CLASS_NONE,
        IOPRIO_CLASS_RT,
        IOPRIO_CLASS_BE,
        IOPRIO_CLASS_IDLE,
    };

    enum {
        IOPRIO_WHO_PROCESS = 1,
        IOPRIO_WHO_PGRP,
        IOPRIO_WHO_USER,
    };

    /* An io priority value is (class << IOPRIO_CLASS_SHIFT) | class data. */
    #define IOPRIO_CLASS_SHIFT  13
    #define IOPRIO_PRIO_MASK    ((1 << IOPRIO_CLASS_SHIFT) - 1)
    /* Levels accepted by -n: 0 (highest) through 7 (lowest). */
    #define IOPRIO_NR_LEVELS    8

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    /* Printable class names, indexed by IOPRIO_CLASS_*. */
    static const char *const to_prio[] = { "none", "realtime", "best-effort", "idle", };

    /*
     * Parse @s as a base-10 integer in [@lo, @hi] and store it in @out.
     * Returns 0 on success, -1 if @s is empty, has trailing characters,
     * overflows, or lies outside the range (@out is then left untouched).
     */
    static int parse_int(const char *s, int lo, int hi, int *out)
    {
        char *end;
        long v;

        errno = 0;
        v = strtol(s, &end, 10);
        if (end == s || *end != '\0' || errno == ERANGE || v < lo || v > hi)
            return -1;

        *out = (int)v;
        return 0;
    }

    static void usage(const char *prog)
    {
        fprintf(stderr,
                "usage: %s [-c class] [-n level] [-p pid] [command [args...]]\n",
                prog);
    }

    int main(int argc, char *argv[])
    {
        int ioprio = 4, set = 0, ioprio_class = IOPRIO_CLASS_BE;
        int c, pid = 0;

        /* The leading '+' stops option parsing at the first non-option, so
         * options belonging to the command to run are left alone. */
        while ((c = getopt(argc, argv, "+n:c:p:")) != -1) {
            switch (c) {
            case 'n':
                if (parse_int(optarg, 0, IOPRIO_NR_LEVELS - 1, &ioprio)) {
                    fprintf(stderr, "bad prio level %s\n", optarg);
                    return 1;
                }
                set = 1;
                break;
            case 'c':
                /* Any integer parses here; the class switch below
                 * rejects values that are not a known class. */
                if (parse_int(optarg, INT_MIN, INT_MAX, &ioprio_class)) {
                    fprintf(stderr, "bad prio class %s\n", optarg);
                    return 1;
                }
                set = 1;
                break;
            case 'p':
                if (parse_int(optarg, 0, INT_MAX, &pid)) {
                    fprintf(stderr, "bad pid %s\n", optarg);
                    return 1;
                }
                break;
            default:
                /* getopt() has already reported the offending option. */
                usage(argv[0]);
                return 1;
            }
        }

        switch (ioprio_class) {
        case IOPRIO_CLASS_NONE:
            ioprio_class = IOPRIO_CLASS_BE;
            break;
        case IOPRIO_CLASS_RT:
        case IOPRIO_CLASS_BE:
            break;
        case IOPRIO_CLASS_IDLE:
            /* The idle class has no levels; always pass 7. */
            ioprio = 7;
            break;
        default:
            printf("bad prio class %d\n", ioprio_class);
            return 1;
        }

        if (!set) {
            const char *class_name = "unknown";

            if (!pid && argv[optind] &&
                parse_int(argv[optind], 0, INT_MAX, &pid)) {
                fprintf(stderr, "bad pid %s\n", argv[optind]);
                return 1;
            }

            ioprio = ioprio_get(IOPRIO_WHO_PROCESS, pid);
            /* Check for failure before any other library call can
             * overwrite errno. */
            if (ioprio == -1) {
                perror("ioprio_get");
                return 1;
            }

            printf("pid=%d, %d\n", pid, ioprio);

            ioprio_class = ioprio >> IOPRIO_CLASS_SHIFT;
            ioprio = ioprio & IOPRIO_PRIO_MASK;
            /* The class field is wider than the name table; never index
             * past the end of to_prio[]. */
            if (ioprio_class >= 0 && (size_t)ioprio_class < ARRAY_SIZE(to_prio))
                class_name = to_prio[ioprio_class];
            printf("%s: prio %d\n", class_name, ioprio);
        } else {
            if (ioprio_set(IOPRIO_WHO_PROCESS, pid,
                           ioprio | ioprio_class << IOPRIO_CLASS_SHIFT) == -1) {
                perror("ioprio_set");
                return 1;
            }

            if (argv[optind]) {
                execvp(argv[optind], &argv[optind]);
                /* execvp() only returns when it failed. */
                perror("execvp");
                return 1;
            }
        }

        return 0;
    }


// March 11 2005, Jens Axboe <jens.axboe@oracle.com>