kernel源码学习-1

进程: 系统资源的分配单位, task_struct结构体实现,位于/include/linux/sched.h中

struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack; /* pointer to this task's kernel stack */
atomic_t usage; /* reference count: how many users hold this structure */
unsigned int flags; /* per process flags, defined below */
unsigned int ptrace; /* ptrace (tracing/debugging) flags */

#ifdef CONFIG_SMP /* fields used only on multi-processor builds */
struct llist_node wake_entry;
int on_cpu;
struct task_struct *last_wakee;
unsigned long wakee_flips;
unsigned long wakee_flip_decay_ts;

int wake_cpu;
#endif
/* run-queue and scheduling related fields */
int on_rq;

int prio, static_prio, normal_prio; /* scheduling priorities */
unsigned int rt_priority; /* realtime priority */
/* scheduler class and per-class scheduling entities */
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
/* task-group linkage */
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
struct sched_dl_entity dl;

#ifdef CONFIG_PREEMPT_NOTIFIERS
/* list of struct preempt_notifier: */
struct hlist_head preempt_notifiers;
#endif
/* block-device I/O layer tracing (blktrace) */
#ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int btrace_seq;
#endif
/* scheduling-policy related fields */
unsigned int policy;
int nr_cpus_allowed;
cpumask_t cpus_allowed;
/* RCU synchronization primitives */
#ifdef CONFIG_PREEMPT_RCU
int rcu_read_lock_nesting;
union rcu_special rcu_read_unlock_special;
struct list_head rcu_node_entry;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_PREEMPT_RCU
struct rcu_node *rcu_blocked_node;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TASKS_RCU
unsigned long rcu_tasks_nvcsw;
bool rcu_tasks_holdout;
struct list_head rcu_tasks_holdout_list;
int rcu_tasks_idle_cpu;
#endif /* #ifdef CONFIG_TASKS_RCU */

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
#endif

struct list_head tasks; /* linkage in the global task list */
#ifdef CONFIG_SMP
struct plist_node pushable_tasks;
struct rb_node pushable_dl_tasks;
#endif

struct mm_struct *mm, *active_mm; /* the task's address space */
#ifdef CONFIG_COMPAT_BRK
unsigned brk_randomized:1;
#endif
/* per-thread vma caching */
u32 vmacache_seqnum;
struct vm_area_struct *vmacache[VMACACHE_SIZE];
#if defined(SPLIT_RSS_COUNTING)
struct task_rss_stat rss_stat;
#endif
/* task state */
/* process exit state parameters */
int exit_state;
int exit_code, exit_signal; /* exit code and the signal sent on exit */
int pdeath_signal; /* The signal sent when the parent dies */
unsigned int jobctl; /* JOBCTL_*, siglock protected */

/* Used for emulating ABI behavior of previous Linux versions */
unsigned int personality;

unsigned in_execve:1; /* Tell the LSMs that the process is doing an
* execve */
unsigned in_iowait:1;

/* Revert to default priority/policy when forking */
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;

#ifdef CONFIG_MEMCG_KMEM
unsigned memcg_kmem_skip_account:1;
#endif

unsigned long atomic_flags; /* Flags needing atomic access. */

struct restart_block restart_block;

pid_t pid; /* process id */
pid_t tgid; /* thread group id (pid of the thread-group leader) */
/* kernel-stack overflow detection */
#ifdef CONFIG_CC_STACKPROTECTOR
/* Canary value for the -fstack-protector gcc feature */
unsigned long stack_canary;
#endif
/*
* pointers to (original) parent process, youngest child, younger sibling,
* older sibling, respectively. (p->father can be replaced with
* p->real_parent->pid)
*/
struct task_struct __rcu *real_parent; /* real parent process */
struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
/*
* children/sibling forms the list of my natural children
*/
/* list of this task's children */
struct list_head children; /* list of my children */
/* sibling linkage */
struct list_head sibling; /* linkage in my parent's children list */
/* thread-group leader */
struct task_struct *group_leader; /* threadgroup leader */

/*
* ptraced is the list of tasks this task is using ptrace on.
* This includes both natural children and PTRACE_ATTACH targets.
* p->ptrace_entry is p's link on the p->parent->ptraced list.
*/
/* ptrace tracer/tracee linkage */
struct list_head ptraced;
struct list_head ptrace_entry;

/* PID/PID hash table linkage. */
struct pid_link pids[PIDTYPE_MAX];
struct list_head thread_group;
struct list_head thread_node;
/* used by do_fork() */
struct completion *vfork_done; /* for vfork() */
int __user *set_child_tid; /* CLONE_CHILD_SETTID */
int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
/* CPU time accounting:
* utime - time spent executing in user mode
* stime - time spent executing in kernel mode */
cputime_t utime, stime, utimescaled, stimescaled;
cputime_t gtime;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
struct cputime prev_cputime;
#endif
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
seqlock_t vtime_seqlock;
unsigned long long vtime_snap;
enum {
VTIME_SLEEPING = 0,
VTIME_USER,
VTIME_SYS,
} vtime_snap_whence;
#endif
unsigned long nvcsw, nivcsw; /* context switch counts */
u64 start_time; /* monotonic time in nsec */
u64 real_start_time; /* boot based time in nsec */
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
unsigned long min_flt, maj_flt;

struct task_cputime cputime_expires;
struct list_head cpu_timers[3];

/* process credentials */
const struct cred __rcu *real_cred; /* objective and real subjective task
* credentials (COW) */
const struct cred __rcu *cred; /* effective (overridable) subjective task
* credentials (COW) */
char comm[TASK_COMM_LEN]; /* executable name excluding path
- access with [gs]et_task_comm (which lock
it with task_lock())
- initialized normally by setup_new_exec */
/* file system info */
int link_count, total_link_count;
#ifdef CONFIG_SYSVIPC
/* ipc stuff */
struct sysv_sem sysvsem;
struct sysv_shm sysvshm;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
/* hung task detection */
unsigned long last_switch_count;
#endif
/* CPU-specific state of this task */
struct thread_struct thread;
/* filesystem information */
struct fs_struct *fs;
/* open file information */
struct files_struct *files;
/* namespaces */
struct nsproxy *nsproxy;
/* signal handlers */
struct signal_struct *signal;
struct sighand_struct *sighand;

sigset_t blocked, real_blocked;
sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
struct sigpending pending;

unsigned long sas_ss_sp;
size_t sas_ss_size;
int (*notifier)(void *priv);
void *notifier_data;
sigset_t *notifier_mask;
struct callback_head *task_works;

struct audit_context *audit_context;
#ifdef CONFIG_AUDITSYSCALL
kuid_t loginuid;
unsigned int sessionid;
#endif
struct seccomp seccomp;

/* Thread group tracking */
u32 parent_exec_id;
u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
* mempolicy */
spinlock_t alloc_lock;

/* Protection of the PI data structures: */
raw_spinlock_t pi_lock;

#ifdef CONFIG_RT_MUTEXES
/* PI waiters blocked on a rt_mutex held by this task */
struct rb_root pi_waiters;
struct rb_node *pi_waiters_leftmost;
/* Deadlock detection and priority inheritance handling */
struct rt_mutex_waiter *pi_blocked_on;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
/* mutex deadlock detection */
struct mutex_waiter *blocked_on;
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
unsigned int irq_events;
unsigned long hardirq_enable_ip;
unsigned long hardirq_disable_ip;
unsigned int hardirq_enable_event;
unsigned int hardirq_disable_event;
int hardirqs_enabled;
int hardirq_context;
unsigned long softirq_disable_ip;
unsigned long softirq_enable_ip;
unsigned int softirq_disable_event;
unsigned int softirq_enable_event;
int softirqs_enabled;
int softirq_context;
#endif
#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH 48UL
u64 curr_chain_key;
int lockdep_depth;
unsigned int lockdep_recursion;
struct held_lock held_locks[MAX_LOCK_DEPTH];
gfp_t lockdep_reclaim_gfp;
#endif

/* journalling filesystem info */
void *journal_info;

/* stacked block device info */
struct bio_list *bio_list;

#ifdef CONFIG_BLOCK
/* stack plugging */
struct blk_plug *plug;
#endif

/* VM state */
struct reclaim_state *reclaim_state; /* memory-reclaim state */

struct backing_dev_info *backing_dev_info; /* block-device I/O traffic info */

struct io_context *io_context; /* info used by the I/O scheduler */

unsigned long ptrace_message;
siginfo_t *last_siginfo; /* For ptrace use. */
struct task_io_accounting ioac; /* per-task I/O accounting */
#if defined(CONFIG_TASK_XACCT)
u64 acct_rss_mem1; /* accumulated rss usage */
u64 acct_vm_mem1; /* accumulated virtual memory usage */
cputime_t acct_timexpd; /* stime + utime since last update */
#endif
#ifdef CONFIG_CPUSETS
nodemask_t mems_allowed; /* Protected by alloc_lock */
seqcount_t mems_allowed_seq; /* Seqence no to catch updates */
int cpuset_mem_spread_rotor;
int cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
/* Control Group info protected by css_set_lock */
struct css_set __rcu *cgroups;
/* cg_list protected by css_set_lock and tsk->alloc_lock */
struct list_head cg_list;
#endif
#ifdef CONFIG_FUTEX
struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
struct compat_robust_list_head __user *compat_robust_list;
#endif
struct list_head pi_state_list;
struct futex_pi_state *pi_state_cache;
#endif
/* perf events profiling state */
#ifdef CONFIG_PERF_EVENTS
struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
struct mutex perf_event_mutex;
struct list_head perf_event_list;
#endif
#ifdef CONFIG_DEBUG_PREEMPT
unsigned long preempt_disable_ip;
#endif
#ifdef CONFIG_NUMA
struct mempolicy *mempolicy; /* Protected by alloc_lock */
short il_next;
short pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
int numa_scan_seq;
unsigned int numa_scan_period;
unsigned int numa_scan_period_max;
int numa_preferred_nid;
unsigned long numa_migrate_retry;
u64 node_stamp; /* migration stamp */
u64 last_task_numa_placement;
u64 last_sum_exec_runtime;
struct callback_head numa_work;

struct list_head numa_entry;
struct numa_group *numa_group;

/*
* numa_faults is an array split into four regions:
* faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
* in this precise order.
*
* faults_memory: Exponential decaying average of faults on a per-node
* basis. Scheduling placement decisions are made based on these
* counts. The values remain static for the duration of a PTE scan.
* faults_cpu: Track the nodes the process was running on when a NUMA
* hinting fault was incurred.
* faults_memory_buffer and faults_cpu_buffer: Record faults per node
* during the current scan window. When the scan completes, the counts
* in faults_memory and faults_cpu decay and these values are copied.
*/
unsigned long *numa_faults;
unsigned long total_numa_faults;

/*
* numa_faults_locality tracks if faults recorded during the last
* scan window were remote/local or failed to migrate. The task scan
* period is adapted based on the locality of the faults with different
* weights depending on whether they were shared or private faults
*/
unsigned long numa_faults_locality[3];

unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */

struct rcu_head rcu; /* RCU head for deferred freeing of this struct */

/*
* cache last used pipe for splice
*/
struct pipe_inode_info *splice_pipe; /* cached splice pipe */

struct page_frag task_frag;

#ifdef CONFIG_TASK_DELAY_ACCT
struct task_delay_info *delays; /* delay accounting */
#endif
#ifdef CONFIG_FAULT_INJECTION
int make_it_fail;
#endif
/*
* when (nr_dirtied >= nr_dirtied_pause), it's time to call
* balance_dirty_pages() for some dirty throttling pause
*/
int nr_dirtied;
int nr_dirtied_pause;
unsigned long dirty_paused_when; /* start of a write-and-pause period */

#ifdef CONFIG_LATENCYTOP
int latency_record_count;
struct latency_record latency_record[LT_SAVECOUNT];
#endif
/*
* time slack values; these are used to round up poll() and
* select() etc timeout values. These are in nanoseconds.
*/
unsigned long timer_slack_ns;
unsigned long default_timer_slack_ns;

#ifdef CONFIG_KASAN
unsigned int kasan_depth;
#endif
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
/* Index of current stored address in ret_stack */
int curr_ret_stack;
/* Stack of return addresses for return function tracing */
struct ftrace_ret_stack *ret_stack;
/* time stamp for last schedule */
unsigned long long ftrace_timestamp;
/*
* Number of functions that haven't been traced
* because of depth overrun.
*/
atomic_t trace_overrun;
/* Pause for the tracing */
atomic_t tracing_graph_pause;
#endif
#ifdef CONFIG_TRACING
/* state flags for use by tracers */
unsigned long trace;
/* bitmask and counter of trace recursion */
unsigned long trace_recursion;
#endif /* CONFIG_TRACING */
#ifdef CONFIG_MEMCG
struct memcg_oom_info {
struct mem_cgroup *memcg;
gfp_t gfp_mask;
int order;
unsigned int may_oom:1;
} memcg_oom;
#endif
#ifdef CONFIG_UPROBES
struct uprobe_task *utask;
#endif
#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
unsigned int sequential_io;
unsigned int sequential_io_avg;
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
unsigned long task_state_change;
#endif
};

进程复制

    传统的UNIX中用于复制进程的系统调用是fork。但它并不是Linux为此实现的唯一调用,实际上Linux实现了3个。
(1 ) fork是重量级调用,因为它建立了父进程的一个完整副本,然后作为子进程执行。为减少与该调用相关的工作量,Linux使用了写时复制(copy-on-write)技术。
(2) vfork类似于fork,但并不创建父进程数据的副本。相反,父子进程之间共享数据。
这节省了大量CPU时间(如果一个进程操纵共享数据,则另一个会自动注意到)。
(3) clone产生线程,可以对父子进程之间的共享、复制进行精确控制。

写时复制(Copy On Write , COW)

fork时只进行page的复制但使用的是同一块物理内存,在要进行写时才进行拷贝使用不同的物理内存。

进程调度: sched_class,位于kernel/sched/sched.h

struct sched_class {
/* Scheduler classes are kept on a singly linked list, ordered by priority. */
const struct sched_class *next;
/* Add a task to the run queue (for CFS: insert the scheduling entity into
* the red-black tree) and increment nr_running. */
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
/* Remove a task from the run queue and decrement nr_running. */
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
/* Yield the CPU: effectively dequeue then re-enqueue, placing the entity
* at the rightmost end of the red-black tree. */
void (*yield_task) (struct rq *rq);
bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
/* Check whether the currently running task should be preempted by p. */
void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);

/*
* It is the responsibility of the pick_next_task() method that will
* return the next task to call put_prev_task() on the @prev task or
* something equivalent.
*
* May return RETRY_TASK when it finds a higher prio class has runnable
* tasks.
*/
/* Pick the next task that should run. */
struct task_struct * (*pick_next_task) (struct rq *rq,
struct task_struct *prev);
/* Put the previously running task back into the run queue. */
void (*put_prev_task) (struct rq *rq, struct task_struct *p);

#ifdef CONFIG_SMP
/* Select a suitable CPU for the task. */
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
/* Migrate the task to another CPU. */
void (*migrate_task_rq)(struct task_struct *p, int next_cpu);

void (*post_schedule) (struct rq *this_rq);
/* Wakeup-related callbacks. */
void (*task_waking) (struct task_struct *task);
void (*task_woken) (struct rq *this_rq, struct task_struct *task);
/* Change the task's CPU affinity. */
void (*set_cpus_allowed)(struct task_struct *p,
const struct cpumask *newmask);
/* Bring a run queue online. */
void (*rq_online)(struct rq *rq);
/* Take a run queue offline. */
void (*rq_offline)(struct rq *rq);
#endif
/* Called when the task changes its scheduling class or task group. */
void (*set_curr_task) (struct rq *rq);
/* Called from the timer tick; may trigger rescheduling (drives
* preemption of the running task). */
void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
/* Called when a task is forked; initialization differs per scheduling class. */
void (*task_fork) (struct task_struct *p);
/* Called when a task dies. */
void (*task_dead) (struct task_struct *p);

/*
* The switched_from() call is allowed to drop rq->lock, therefore we
* cannot assume the switched_from/switched_to pair is serliazed by
* rq->lock. They are however serialized by p->pi_lock.
*/
/* Notifications for switching a task between scheduling classes. */
void (*switched_from) (struct rq *this_rq, struct task_struct *task);
void (*switched_to) (struct rq *this_rq, struct task_struct *task);
/* Notification that the task's priority changed. */
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
int oldprio);

unsigned int (*get_rr_interval) (struct rq *rq,
struct task_struct *task);

void (*update_curr) (struct rq *rq);

#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_move_group) (struct task_struct *p, int on_rq);
#endif
};

Linux调度类

    调度类: dl_sched_class、rt_sched_class、fair_sched_class及idle_sched_class等。每个进程都有对应一种调度策略,每一种调度策略又对应一种调度类(每一个调度类可以对应多种调度策略)。

extern const struct sched_class stop_sched_class;
extern const struct sched_class dl_sched_class;
extern const struct sched_class rt_sched_class;
extern const struct sched_class fair_sched_class;
extern const struct sched_class idle_sched_class;

rt_sched_class类实时调度器(调度策略:SCHED_FIFO、SCHED_RR)
fair_sched_class类完全公平调度器(调度策略: SCHED_NORMAL、SCHED_BATCH)
SCHED_FIFO调度策略的实时进程永远比SCHED_NORMAL调度策略的普通进程优先运行。
调度类优先级顺序: stop_sched_class>dl_sched_class>rt_sched_class>fair_sched_class>idle_sched_class。

SCHED_NORMAL,SCHED_BATCH,SCHED_IDLE对应fair_sched_class

SCHED_RR,SCHED_FIFO对应rt_sched_class

/* Highest-priority scheduling class: the per-CPU stop task preempts every
 * other task and can itself never be preempted. */
const struct sched_class stop_sched_class = {
.next = &dl_sched_class,

.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,
.yield_task = yield_task_stop,

.check_preempt_curr = check_preempt_curr_stop,

.pick_next_task = pick_next_task_stop,
.put_prev_task = put_prev_task_stop,

#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_stop,
#endif

.set_curr_task = set_curr_task_stop,
.task_tick = task_tick_stop,

.get_rr_interval = get_rr_interval_stop,

.prio_changed = prio_changed_stop,
.switched_to = switched_to_stop,
.update_curr = update_curr_stop,
};
/* Deadline scheduling class (SCHED_DEADLINE). */
const struct sched_class dl_sched_class = {
.next = &rt_sched_class,
.enqueue_task = enqueue_task_dl,
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,

.check_preempt_curr = check_preempt_curr_dl,

.pick_next_task = pick_next_task_dl,
.put_prev_task = put_prev_task_dl,

#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_dl,
.set_cpus_allowed = set_cpus_allowed_dl,
.rq_online = rq_online_dl,
.rq_offline = rq_offline_dl,
.post_schedule = post_schedule_dl,
.task_woken = task_woken_dl,
#endif

.set_curr_task = set_curr_task_dl,
.task_tick = task_tick_dl,
.task_fork = task_fork_dl,
.task_dead = task_dead_dl,

.prio_changed = prio_changed_dl,
.switched_from = switched_from_dl,
.switched_to = switched_to_dl,

.update_curr = update_curr_dl,
};
/* Scheduling class for realtime tasks (SCHED_FIFO / SCHED_RR). */
const struct sched_class rt_sched_class = {
.next = &fair_sched_class,
.enqueue_task = enqueue_task_rt, /* add a task at the head or tail of its ready queue */
.dequeue_task = dequeue_task_rt, /* remove a task from its ready queue */
.yield_task = yield_task_rt, /* voluntarily give up the CPU */

.check_preempt_curr = check_preempt_curr_rt,

.pick_next_task = pick_next_task_rt, /* core scheduler asks which queued task runs next */
.put_prev_task = put_prev_task_rt, /* called when the running task is about to be switched out */

#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_rt, /* choose a CPU for the task, distributing tasks across CPUs */

.set_cpus_allowed = set_cpus_allowed_rt,
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
.post_schedule = post_schedule_rt,
.task_woken = task_woken_rt,
.switched_from = switched_from_rt,
#endif

.set_curr_task = set_curr_task_rt, /* called when a task changes its scheduling class or task group */
.task_tick = task_tick_rt, /* timer-tick hook: update runtime statistics, decide whether to resched */

.get_rr_interval = get_rr_interval_rt,

.prio_changed = prio_changed_rt,
.switched_to = switched_to_rt,

.update_curr = update_curr_rt,
};
/* CFS, the completely fair scheduler; used for normal (non-realtime) tasks. */
const struct sched_class fair_sched_class = {
.next = &idle_sched_class,
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
.yield_to_task = yield_to_task_fair,

.check_preempt_curr = check_preempt_wakeup,

.pick_next_task = pick_next_task_fair,
.put_prev_task = put_prev_task_fair,

#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_fair,
.migrate_task_rq = migrate_task_rq_fair,

.rq_online = rq_online_fair,
.rq_offline = rq_offline_fair,

.task_waking = task_waking_fair,
#endif

.set_curr_task = set_curr_task_fair,
.task_tick = task_tick_fair,
.task_fork = task_fork_fair,

.prio_changed = prio_changed_fair,
.switched_from = switched_from_fair,
.switched_to = switched_to_fair,

.get_rr_interval = get_rr_interval_fair,

.update_curr = update_curr_fair,

#ifdef CONFIG_FAIR_GROUP_SCHED
.task_move_group = task_move_group_fair,
#endif
};
/* Scheduling class for each CPU's idle task (the statically created PID 0 thread). */
const struct sched_class idle_sched_class = {
/* .next is NULL */
/* no enqueue/yield_task for idle tasks */

/* dequeue is not valid, we print a debug message there: */
.dequeue_task = dequeue_task_idle,

.check_preempt_curr = check_preempt_curr_idle,

.pick_next_task = pick_next_task_idle,
.put_prev_task = put_prev_task_idle,

#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
#endif

.set_curr_task = set_curr_task_idle,
.task_tick = task_tick_idle,

.get_rr_interval = get_rr_interval_idle,

.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
.update_curr = update_curr_idle,
};

优先级

#define MAX_USER_RT_PRIO    100
#define MAX_RT_PRIO MAX_USER_RT_PRIO

#define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH)
#define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2)

进程优先级0-139,数字越小,优先级越高,0-99留给实时进程,100-139给普通进程

调度策略

/*
* Scheduling policies
*/
#define SCHED_NORMAL 0
#define SCHED_FIFO 1
#define SCHED_RR 2
#define SCHED_BATCH 3
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE 5
#define SCHED_DEADLINE 6

unsigned int policy —— 保存进程的调度策略(共6种):

SCHED_NORMAL:用于普通进程,通过CFS调度器实现。
SCHED_BATCH:相当于SCHED_NORMAL分化版本,采用分时策略,根据动态优先级,分配CPU运行需要资源。
SCHED_IDLE:优先级最低,在系统空闲时才执行这类进程。
SCHED_FIFO:先进先出调度算法(实时调度策略),相同优先级的任务先到先服务,高优先级的任务可以抢占低优先级的任务。
SCHED_RR:轮流调度算法(实时调度策略)。
SCHED_DEADLINE:新支持实时进程调度策略,针对突发性计算。
SCHED_BATCH用于非交互的处理器消耗型进程,SCHED_IDLE是在系统负载很低时使用CFS。

完全公平调度算法(时间片轮询调度算法)

参考:操作系统 时间片轮转调度算法_konsy_dong的博客-CSDN博客_时间片轮转调度

1、实际运行时间:

实际运行时间=调度周期*进程权重/所有进程权重之和

调度周期:指所有进程运行一遍所需要的时间;

进程权重:根据进程的重要性,分配给每个进程不同的权重;

调度周期为60ms,进程A的权重为1,进程B的权重为2,那么:进程A实际运行时间为 60ms×1/(1+2)=20ms,进程B实际运行时间为 60ms×2/(1+2)=40ms。

2、虚拟运行时间:

虚拟运行时间 = 实际运行时间×1024/进程权重 = (调度周期×进程权重/所有进程总权重)×1024/进程权重 = 调度周期×1024/所有进程总权重。

就绪队列

struct cfs_rq {
/* total load of all entities on this CFS run queue */
struct load_weight load;
/* nr_running: number of scheduling entities on this cfs_rq
* h_nr_running: counts tasks only */
unsigned int nr_running, h_nr_running;

u64 exec_clock;
/* tracks the minimum virtual runtime of all entities in the queue */
u64 min_vruntime;
#ifndef CONFIG_64BIT
u64 min_vruntime_copy;
#endif
/* root of the red-black tree that orders all entities by virtual runtime */
struct rb_root tasks_timeline;
/* leftmost node of the tree: the entity to schedule next */
struct rb_node *rb_leftmost;

/*
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
struct sched_entity *curr, *next, *last, *skip;

#ifdef CONFIG_SCHED_DEBUG
unsigned int nr_spread_over;
#endif

#ifdef CONFIG_SMP
/*
* CFS Load tracking
* Under CFS, load is tracked on a per-entity basis and aggregated up.
* This allows for the description of both thread and group usage (in
* the FAIR_GROUP_SCHED case).
*/
unsigned long runnable_load_avg, blocked_load_avg;
atomic64_t decay_counter;
u64 last_decay;
atomic_long_t removed_load;

#ifdef CONFIG_FAIR_GROUP_SCHED
/* Required to track per-cpu representation of a task_group */
u32 tg_runnable_contrib;
unsigned long tg_load_contrib;

/*
* h_load = weight * f(tg)
*
* Where f(tg) is the recursive weight fraction assigned to
* this group.
*/
unsigned long h_load;
u64 last_h_load_update;
struct sched_entity *h_load_next;
#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */

#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */

/*
* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
* a hierarchy). Non-leaf lrqs hold other higher schedulable entities
* (like users, containers etc.)
*
* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
* list is used during load balance.
*/
int on_list;
struct list_head leaf_cfs_rq_list;
/* the task group that owns this CFS run queue */
struct task_group *tg; /* group that "owns" this runqueue */

#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
u64 runtime_expires;
s64 runtime_remaining;

u64 throttled_clock, throttled_clock_task;
u64 throttled_clock_task_time;
int throttled, throttle_count;
struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};

实时调度实体,位于/include/linux/sched.h

struct sched_rt_entity {
struct list_head run_list;
unsigned long timeout; /* used to check whether the task ran longer than RLIMIT_RTTIME */
unsigned long watchdog_stamp;
unsigned int time_slice; /* time slice for the SCHED_RR policy */

struct sched_rt_entity *back; /* temporary, used by dequeue_rt_stack() */
#ifdef CONFIG_RT_GROUP_SCHED
struct sched_rt_entity *parent; /* parent (higher-level) scheduling entity */
/* rq on which this entity is (to be) queued: */
struct rt_rq *rt_rq; /* the ready queue this entity is queued on */
/* rq "owned" by this entity/group: */
struct rt_rq *my_q; /* ready queue holding this entity's child entities */
#endif
};

实时调度核心操作(/kernel/sched/rt.c)

/* Update scheduling info and insert the entity into its priority queue. */
static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
struct sched_rt_entity *rt_se = &p->rt;

if (flags & ENQUEUE_WAKEUP)
rt_se->timeout = 0;
/* Add the entity to the list for its priority; head or tail depends on
* whether flags contains ENQUEUE_HEAD. */
enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);

if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}

static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
struct sched_rt_entity *rt_se;
struct task_struct *p;
struct rt_rq *rt_rq = &rq->rt;

do { /* walk down the group-scheduling hierarchy until a task entity is found */
rt_se = pick_next_rt_entity(rq, rt_rq);
BUG_ON(!rt_se);
rt_rq = group_rt_rq(rt_se);
} while (rt_rq);

p = rt_task_of(rt_se);
/* Record the start of this task's execution period. */
p->se.exec_start = rq_clock_task(rq);

return p;
}

static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
struct rt_rq *rt_rq)
{
struct rt_prio_array *array = &rt_rq->active;
struct sched_rt_entity *next = NULL;
struct list_head *queue;
int idx;
/* Find the highest priority with a runnable entity (first set bit). */
idx = sched_find_first_bit(array->bitmap);
BUG_ON(idx >= MAX_RT_PRIO);
/* Select the list corresponding to that priority. */
queue = array->queue + idx;
next = list_entry(queue->next, struct sched_rt_entity, run_list);
/* Return the entity at the head of that list. */
return next;
}

static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
struct sched_rt_entity *rt_se = &p->rt ;
/* Update run-time accounting first. */
update_curr_rt(rq);
/* Remove rt_se from the run queue. */
dequeue_rt_entity(rt_se);
/* Remove the task from the pushable-tasks list. */
dequeue_pushable_task(rq, p);
}

static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_rt_entity *rt_se = &curr->rt;
u64 delta_exec;
/* Only account when the current task belongs to the realtime class. */
if (curr->sched_class != &rt_sched_class)
return;
/* Time executed since exec_start was last set. */
delta_exec = rq_clock_task(rq) - curr->se.exec_start;
if (unlikely((s64)delta_exec <= 0))
return;

schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
/* Accumulate into the task's total execution time. */
curr->se.sum_exec_runtime += delta_exec;
account_group_exec_runtime(curr, delta_exec);

curr->se.exec_start = rq_clock_task(rq);
cpuacct_charge(curr, delta_exec);

sched_rt_avg_update(rq, delta_exec);

if (!rt_bandwidth_enabled())
return;

/* Charge the runtime to every rt_rq up the hierarchy; reschedule if the
* RT bandwidth limit has been exceeded. */
for_each_sched_rt_entity(rt_se) {
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);

if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
raw_spin_lock(&rt_rq->rt_runtime_lock);
rt_rq->rt_time += delta_exec;
if (sched_rt_runtime_exceeded(rt_rq))
resched_curr(rq);
raw_spin_unlock(&rt_rq->rt_runtime_lock);
}
}
}
static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
struct rq *rq = rq_of_rt_se(rt_se);

dequeue_rt_stack(rt_//从运行队列中删除se);

for_each_sched_rt_entity(rt_se) {
struct rt_rq *rt_rq = group_rt_rq(rt_se);

if (rt_rq && rt_rq->rt_nr_running)
__enqueue_rt_entity(rt_se, false);//添加到队尾
}
enqueue_top_rt_rq(&rq->rt);
}

设备与获取优先级主要函数

/* Set the scheduling parameters (priority) in a thread attribute object. */
int pthread_attr_setschedparam(pthread_attr_t *attr, const struct sched_param *param);
/* Get the scheduling parameters; note that param is an OUTPUT argument,
 * so it must not be const (and attr is the const, read-only argument). */
int pthread_attr_getschedparam(const pthread_attr_t *attr, struct sched_param *param);

/* Example: param.sched_priority = 51; sets the priority. */
/* Newly created threads default to the SCHED_OTHER policy; change the
 * scheduling policy with: */
int pthread_attr_setschedpolicy(pthread_attr_t *attr, int policy);

/* Scheduling parameter structure (as declared in <sched.h>): */
struct sched_param
{
int sched_priority; /* the thread priority to apply */
};