Programming Linux Modules

Kernel Module

Implemented modules

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
//select_and_show.c
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/moduleparam.h>
#include <linux/sched/signal.h>

/* PID of the task to inspect; zero unless overridden through the module parameter. */
static unsigned int pid;

/* Expose `pid` as a module parameter of type uint with permission bits 0644. */
module_param(pid, uint, 0644);

static int select_and_show_init(void) {
struct task_struct * p;
struct task_struct * parent;
struct task_struct * children;
struct task_struct * sibling;
struct list_head * list;

printk("Begin\n");
printk("Relationship\tName \tPID \tStat \tPrio \n");
p = pid_task(find_vpid(pid), PIDTYPE_PID);
printk("Self \t%-20s\t%-6d\t%-6ld\t%-6d\n", p->comm, p->pid, p->state, p->prio);

parent = p->parent;
printk("Parent \t%-20s\t%-6d\t%-6ld\t%-6d\n", parent->comm, parent->pid, parent->state, parent->prio);

list = &parent->children;
list_for_each(list, &parent->children) {
sibling = list_entry(list, struct task_struct, sibling);
printk("Brother \t%-20s\t%-6d\t%-6ld\t%-6d\n", sibling->comm, sibling->pid, sibling->state, sibling->prio);
}

list = &p->children;
list_for_each(list, &p->children) {
children = list_entry(list, struct task_struct, sibling);
printk("Children \t%-20s\t%-6d\t%-6ld\t%-6d\n", children->comm, children->pid, children->state, children->prio);
}
return 0;
}

/* Module unload handler: emits a single farewell line at KERN_ALERT level. */
static void select_and_show_exit(void)
{
	pr_alert("goodbye~\n");
}

/* Register the load and unload handlers of this module. */
module_init(select_and_show_init);
module_exit(select_and_show_exit);

/* Declare the license of this module as GPL. */
MODULE_LICENSE("GPL");
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
//show_all.c
#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init_task.h>

/*
 * show_all_init - module load handler.
 *
 * Logs a header line and then one line per process visited by
 * for_each_process(): name, PID, state, priority and the parent's PID.
 *
 * Return: always 0.
 */
static int show_all_init(void) {
	struct task_struct *p;

	/* Terminate the header with a newline so it forms a complete log line. */
	printk(KERN_ALERT"Name \tPid\tStat\tPrio\tParent\n");

	/*
	 * Hold the RCU read lock for the whole walk so that the task
	 * structures being read are not freed underneath us.
	 */
	rcu_read_lock();
	for_each_process(p) {
		printk(KERN_ALERT"%-20s\t%d\t%ld\t%d\t%d\n", p->comm, p->pid, p->state, p->prio, rcu_dereference(p->parent)->pid);
	}
	rcu_read_unlock();

	return 0;
}

/* Module unload handler: emits a single farewell line at KERN_ALERT level. */
static void show_all_exit(void)
{
	pr_alert("goodbye~\n");
}

/* Register the load and unload handlers of this module. */
module_init(show_all_init);
module_exit(show_all_exit);

/* Declare the license of this module as GPL. */
MODULE_LICENSE("GPL");
1
2
3
4
5
6
7
8
# Out-of-tree build of the show_all kernel module.
obj-m := show_all.o
# show_all.o is linked from lab2_1.o, so the C source must be named lab2_1.c.
# NOTE(review): if the source file is actually show_all.c, drop this line.
show_all-objs := lab2_1.o

# Kernel build tree; override on the command line with `make KDIR=/path/to/tree`.
KDIR ?= /home/test/test_kernel/linux-5.3.2
PWD := $(shell pwd)

.PHONY: default clean

# Recipe lines must start with a TAB character; $(MAKE) passes flags and the
# jobserver on to the kernel's build system.
default:
	$(MAKE) -C $(KDIR) M=$(PWD) modules

clean:
	$(MAKE) -C $(KDIR) M=$(PWD) clean

License statement for modules

MODULE_LICENSE("GPL");

1
2
3
4
5
6
7
MODULE_LICENSE(_license)   // _license is the license name string
//"GPL"    [GNU Public License v2 or later]
//"GPL v2"     [GNU Public License v2]
//"GPL and additional rights"     [GNU Public License v2 rights and more]
//"Dual BSD/GPL" [GNU Public License v2 or BSD license choice]
//"Dual MIT/GPL" [GNU Public License v2 or MIT license choice]
//"Dual MPL/GPL" [GNU Public License v2 or Mozilla license choice]

Since kernel version 2.4.10, a module must declare its license with the MODULE_LICENSE macro; otherwise loading the module marks the kernel as tainted and a “kernel tainted” warning is printed. As the linux/module.h file shows, the license strings accepted by the kernel are “GPL”, “GPL v2”, “GPL and additional rights”, “Dual BSD/GPL”, “Dual MIT/GPL”, “Dual MPL/GPL” and “Proprietary”.

module_init (TODO)

Find the include\linux\init.h file in the kernel source code directory

1
/* module_init(x) expands to __initcall(x) followed by a semicolon. */
#define module_init(x) __initcall(x);

If this is a macro definition, then what is __initcall(x)?

1
2
3
/*
 * __define_initcall(fn, id) - record fn as an initcall with the given id.
 *
 * Defines a static initcall_t variable named __initcall_<fn><id>, initialized
 * to fn and placed in the section ".initcall<id>.init". Nothing may follow a
 * continuation backslash on its line, not even spaces.
 */
#define __define_initcall(fn, id) \
static initcall_t __initcall_##fn##id __used \
__attribute__((__section__(".initcall" #id ".init"))) = fn

initcalls

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/*
 * Each xxx_initcall(fn) macro forwards to __define_initcall(fn, id) with a
 * fixed id: 0 to 7, rootfs, or a number followed by "s" for the _sync variants.
 */
#define pure_initcall(fn)       __define_initcall(fn, 0)  

#define core_initcall(fn) __define_initcall(fn, 1)
#define core_initcall_sync(fn) __define_initcall(fn, 1s)
#define postcore_initcall(fn) __define_initcall(fn, 2)
#define postcore_initcall_sync(fn) __define_initcall(fn, 2s)
#define arch_initcall(fn) __define_initcall(fn, 3)
#define arch_initcall_sync(fn) __define_initcall(fn, 3s)
#define subsys_initcall(fn) __define_initcall(fn, 4)
#define subsys_initcall_sync(fn) __define_initcall(fn, 4s)
#define fs_initcall(fn) __define_initcall(fn, 5)
#define fs_initcall_sync(fn) __define_initcall(fn, 5s)
#define rootfs_initcall(fn) __define_initcall(fn, rootfs)
#define device_initcall(fn) __define_initcall(fn, 6)
#define device_initcall_sync(fn) __define_initcall(fn, 6s)
#define late_initcall(fn) __define_initcall(fn, 7)
#define late_initcall_sync(fn) __define_initcall(fn, 7s)

/* __initcall(fn) is an alias for device_initcall(fn), i.e. id 6. */
#define __initcall(fn) device_initcall(fn)

We can see very many xxx_initcall macro function definitions, they are all implemented by __define_initcall. Inside __define_initcall there are two parameters, one is fn and the other is id.

The function do_initcalls can be found in the init\main.c file

1
2
3
4
5
6
7
/*
 * do_initcalls - call do_initcall_level() once for every level index from 0
 * up to, but not including, ARRAY_SIZE(initcall_levels) - 1.
 */
static void __init do_initcalls(void)  
{
int level;

/* The last slot of initcall_levels is deliberately left out of the loop. */
for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++)
do_initcall_level(level);
}

do_initcalls seems to be mainly a for loop, which is executing some functions by level.

So the question arises: what is a level, and which functions are executed at each level? To answer that we have to go back to the macro definitions above. First, a quick walk through the macro expansion chain:

module_init(fn)---> __initcall(fn) ---> device_initcall(fn) ---> __define_initcall(fn, 6)

1
2
3
/*
 * __define_initcall(fn, id) - record fn as an initcall with the given id.
 *
 * Defines a static initcall_t variable named __initcall_<fn><id>, initialized
 * to fn and placed in the section ".initcall<id>.init". Nothing may follow a
 * continuation backslash on its line, not even spaces.
 */
#define __define_initcall(fn, id) \
static initcall_t __initcall_##fn##id __used \
__attribute__((__section__(".initcall" #id ".init"))) = fn

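In the macro definition above, ## is the token-pasting operator: it concatenates the tokens on either side of it, so __initcall_##fn##id becomes __initcall_<fn><id>.

For example, when fn is helloworld and id is 4, __initcall_##fn##id expands to __initcall_helloworld4.

A single # is the stringizing operator: #id turns the argument into a string literal, e.g. "6" when id is 6.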

TODO…


Parameters of printk

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
// Emergency event message, prompted before a system crash, indicating that the system is unavailable
#define KERN_EMERG 0

// Report message that immediate action must be taken
#define KERN_ALERT 1

// Critical conditions, usually involving severe hardware or software operational failures
#define KERN_CRIT 2

// Error conditions, drivers often use KERN_ERR to report hardware errors
#define KERN_ERR 3

// Warning conditions for possible problem situations
#define KERN_WARNING 4

// Normal but important conditions for reminding
#define KERN_NOTICE 5

// Prompt messages, such as printing hardware information when the driver starts
#define KERN_INFO 6

// Debug level messages
#define KERN_DEBUG 7

task_struct

Status of the process

1
volatile long state;    /* -1: unrunnable, 0: runnable, >0: stopped */

​The process in Linux consists of multiple states, and during operation, the process will switch in multiple situations with scheduling, and the information of the process is the basis for the process to make scheduling swaps

State Meaning
TASK_RUNNING Runnable
TASK_INTERRUPTIBLE Waiting
TASK_UNINTERRUPTIBLE Uninterruptible waiting
TASK_ZOMBIE Zombie
TASK_STOPPED Pause
TASK_SWAPPING Switching in/out

Flags of the process

1
unsigned int flags; /* per process flags, defined below */

​Used by the kernel to identify the state of the current process for the next operation

Flag Meaning
PF_FORKNOEXEC The process has just been created and has not yet been executed
PF_SUPERPRIV Super User Privileges
PF_DUMPCORE Catching of exceptions
PF_SIGNALED Process killed by signal
PF_EXITING The process begins to close

Identifier of the process

1
2
pid_t pid;     //Identifier of the process
pid_t tgid; //Group number of the thread

Relatives between processes

1
2
3
4
5
6
7
8
struct task_struct *real_parent; /* real parent process */
struct task_struct *parent; /* recipient of SIGCHLD, wait4() reports */
/*
* children/sibling forms the list of my natural children
*/
struct list_head children; /* list of my children */
struct list_head sibling; /* linkage in my parent's children list */
struct task_struct *group_leader; /* threadgroup leader */

Processes are created with an inheritance relationship: a process can create multiple child processes, of which it is the parent, and these child processes are siblings of each other.

​ When creating a child process, the child process inherits most of the information from the parent process, which means that the child process copies most of the information from the task_struct structure of the parent process, except for the pid, and thus the system needs to record these relatives in order to collaborate between processes.

​ The task_struct structure of each process contains a number of pointers that connect the task _struct structures of all the processes to form a process tree.

Relatives Meaning
real_parent real parent
parent parent process
children The head of the chain table, all elements of the chain table are its child processes
sibling Insert the current process into the sibling chain
group_leader Points to the leader of its thread group

ptrace system call

1
unsigned int ptrace;

​ The ptrace system call provides the ability for the parent process to observe and control the execution of the child process, and allows the parent process to check and replace the values of the child process’ kernel image (including registers).

Basic principle: while a process is being traced with ptrace, every signal sent to the traced child process (except SIGKILL) is intercepted: the child is stopped and the parent process is notified. After the parent receives the notification, it can inspect and modify the stopped child process and then let it continue to run. The common debugging tool gdb is implemented on top of ptrace.

Scheduling information of the process

1
2
3
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;

sched_class: Scheduling Class

se: Scheduling entity for normal processes; each process has one of these entities

rt: Scheduling entity for real-time processes; each process has one of these entities

Process scheduling uses this information to decide the priority order in which processes execute and, combined with the process state information, ensures that processes run in a reasonable and orderly manner. A process can use one of several scheduling policies, as follows.

Name Meaning Usage
SCHED_OTHER Other scheduling methods Normal process
SCHED_FIFO First in first out Real-time processes
SCHED_RR Round-Robin Real-time processes

Priority of the process

1
2
int prio, static_prio, normal_prio;
unsigned int rt_priority;
Name Priority
prio Dynamic Priority
static_prio Static Priority
normal_prio Normal Priority
rt_priority Real-time Priority
  • The value of prio is the final priority value used by the scheduler, i.e., the value the scheduler actually looks at when selecting a process. The smaller the prio value, the higher the process's priority. prio values range from 0 to MAX_PRIO - 1, i.e., 0 to 139 (inclusive), and fall into two intervals depending on the scheduling policy: 0 to 99 is for real-time processes and 100 to 139 is for non-real-time processes.
  • static_prio static priority will not change over time, the kernel will not actively modify it, but only through the system call nice to modify static_prio, and the static priority calculation formula is static_prio = MAX_RT_PRIO + nice +20. The value of MAX_RT_PRIO is 100, and the range of nice range is -20 to +19, so the static_prio value ranges from 100 to 139. The smaller the value of static_prio, the higher the static priority of the process.
  • The value of normal_prio depends on the static priority and scheduling policy and can be set by the _setscheduler function. For non-real-time processes, the value of normal_prio is equal to the static priority value static_prio; for real-time processes, normal_prio = MAX_RT_PRIO-1 - p->rt_priority.
  • The rt_priority value ranges from 0 ~ 99 and is only valid for real-time processes. From the equation: prio = MAX_RT_PRIO-1 - p->rt_priority; it can be seen that the larger the value of rt_priority, the smaller the value of prio, so the larger the value of real time priority (rt_priority) means the higher the priority of the process.

Time data information

1
2
3
4
5
6
7
8
9
10
11
cputime_t utime, stime, utimescaled, stimescaled;
cputime_t gtime;
cputime_t prev_utime, prev_stime;
unsigned long nvcsw, nivcsw; /* context switch counts */
struct timespec start_time; /* monotonic time */
struct timespec real_start_time; /* boot based time */
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
unsigned long min_flt, maj_flt;

struct task_cputime cputime_expires;
struct list_head cpu_timers[3];
Name Meaning
utime/stime Time (in ticks) the process has spent in user/kernel state
utimescaled/stimescaled Scaled run time of the process in user/kernel state
gtime Virtual machine (guest) time, counted in ticks
prev_utime/prev_stime Previous running time
nvcsw/nivcsw Voluntary/involuntary context switch counts
start_time/real_start_time Process creation time; the latter includes sleep time
cputime_expires Processor time of the process or process group being tracked
min_flt, maj_flt Page-fault statistics

Communication between processes

1
2
3
4
#ifdef CONFIG_SYSVIPC
/* ipc stuff */
struct sysv_sem sysvsem;
#endif

​If multiple processes are performing collaboration on a task, then it is necessary that these incoming processes can access each other’s resources and communicate with each other.
The main process communication methods in Linux are:

  • pipes
  • semaphores
  • shared memory
  • signals
  • message queues

File Information

1
2
3
4
5
6
/* file system info */  
int link_count, total_link_count;
/* filesystem information */
struct fs_struct *fs;
/* open file information */
struct files_struct *files;
define Meaning
struct fs_struct *fs Filesystem context of the process (its root directory and current working directory)
struct files_struct *files Files opened by the process

Processes can open or close files, which are system resources, and the Linux kernel has to keep a record of how the process uses the files.

There are two data structures in the task_struct structure that describe the file-related information of the process.

The fs_struct describes two VFS index nodes (inodes), called root and pwd, which point to the root directory and the current working directory of the process's executable image, respectively.

The files_struct structure is used to record the descriptors of the files opened by the process.

Signal processing information

1
2
3
4
5
6
7
8
9
10
11
struct signal_struct *signal;
struct sighand_struct *sighand;
sigset_t blocked, real_blocked;
sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
struct sigpending pending;

unsigned long sas_ss_sp;
size_t sas_ss_size;
int (*notifier)(void *priv);
void *notifier_data;
sigset_t *notifier_mask;
name Meaning
signal Signal descriptor pointing to the process
sighand Signal handler descriptor pointing to the process
blocked Indicates the mask of the blocked signal, real_blocked indicates a temporary mask
pending Data structure for storing private pending signals
sas_ss_sp Alternate stack address for signal handlers; sas_ss_size indicates the stack size
notifier_data/notifier_mask The device driver uses the function pointed to by notifier to block certain signals of the process. notifier_data is the data that may be used by the function pointed to by notifier

Virtual memory handling

1
struct mm_struct *mm, *active_mm;
define Meaning
struct mm_struct *mm Describe the address space of the process
struct mm_struct *active_mm Address space borrowed by kernel threads

mm_struct is used to describe the address space (virtual space) of each process. active_mm is introduced for kernel threads, because kernel threads do not have their own address space. In order to make kernel threads have a uniform context switch with ordinary processes, when a kernel thread makes a context switch, let the active_mm of the switched-in thread point to the active_mm of the process that has just been dispatched out.

Page management information

When there is not enough physical memory, the Linux memory management system needs to transfer some pages from memory to external memory, and the swap is done on a page-by-page basis.

define Meaning
int swappable Whether the memory pages occupied by the process can be swapped out
unsigned long min_flt, maj_flt, nswap The process's cumulative minor page-fault count, major page-fault count, and cumulative number of pages swapped out and in
unsigned long cmin_flt, cnswap Cumulative minor page-fault count and swapped-page count of all descendant processes, with this process as their ancestor

Process Queue Pointer

  1. struct task_struct *next_task, *prev_task; // All processes (in the form of PCBs) form a two-way chain. next_task and prev_task are the front and back pointers to the chain. The head and tail of the chain are init_task (i.e. process 0).

  2. struct task_struct *next_run, *prev_run; // The run_queue is a two-way circular chain of processes that are running or can be run with the process status TASK_RUNNING. The front and back pointers of the chain are next_run and prev_run, and the head and tail of the chain are both init_task (i.e. process 0).

  3. struct task_struct *p_opptr, *p_pptr;struct task_struct *p_cptr, *p_ysptr, *p_osptr; // The above are pointers to the original parent, parent, youngest child, and newer and older sibling processes respectively.

TODO…

init_task

init_task is the first process of the kernel, process number 0, which becomes idle process when the initialization of the kernel is completed

init_task is the task_struct prototype for all processes and threads in the kernel. During kernel initialization, a task_struct instance named init_task is constructed by static definition; later in the initialization, the rest_init() function creates two new kernel threads: the kernel init thread and the kthreadd kernel thread.

  • The kernel init thread, which eventually executes the /sbin/init process, becomes the root process of all user state programs (as shown by the pstree command), i.e. the user space init process

    The first, init, is a kernel thread created by kernel_thread; after initialization it moves to user space and becomes the ancestor of all user processes

  • kernel kthreadd kernel thread, becomes the parent of all other daemon threads in the kernel state.

    Its task is to manage and schedule other kernel threads kernel_thread, which loops through a kthread function that runs the kthreads maintained in the kthread_create_list global chain, and the kernel threads created when we call kernel_thread are added to this chain, so all kernel threads are directly or indirectly parented to kthreadd

The kernel will use the init_task as its task_struct structure descriptor, and when the system has nothing else to do, it will schedule its execution. At this point, the kernel will become an idle process, giving up the CPU and putting itself to sleep in a continuous loop.

Initialization of the stack

The process init_task is defined in init/init_task.c

1
2
3
/* Initial task structure */                                                       
struct task_struct init_task = INIT_TASK(init_task);
EXPORT_SYMBOL(init_task);

The macro for INIT_TASK is defined in include/linux/init_task.h

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
#define INIT_TASK(tsk)  \                                                       
{ \
.state = 0, \
.stack = &init_thread_info, \
.usage = ATOMIC_INIT(2), \
.flags = PF_KTHREAD, \
.prio = MAX_PRIO-20, \
.static_prio = MAX_PRIO-20, \
.normal_prio = MAX_PRIO-20, \
.policy = SCHED_NORMAL, \
.cpus_allowed = CPU_MASK_ALL, \
.nr_cpus_allowed= NR_CPUS, \
.mm = NULL, \
.active_mm = &init_mm, \
.restart_block = { \
.......
}

We can see that the stack of the init_task process is pointing to the init_thread_info

In the file arch/arm/include/asm/thread_info.h, init_thread_info is defined as follows

1
#define init_thread_info        (init_thread_union.thread_info) 

init_thread_info is a member of thread_info of init_thread_union

The variable init_thread_union is defined in init/init_task.c.

1
2
3
4
5
6
7
8
9
10
11
12
union thread_union init_thread_union __init_task_data =                            
{ INIT_THREAD_INFO(init_task) };

#define INIT_THREAD_INFO(tsk) \
{ \
.task = &tsk, \
.flags = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
}

#define __init_task_data __attribute__((__section__(".data..init_task")))
  1. Declares the variable init_thread_union of type union thread_union and initializes its thread_info member; in particular, init_thread_union.thread_info.task = &init_task, which points the task member of this variable at init_task.
  2. __attribute__((__section__(".data..init_task"))) specifies that the section name is .data..init_task; this section is linked into vmlinux at the beginning of .data.

Stack compilation into vmlinux (TODO)

1
2
3
4
5
6
7
8
9
10
11
12
243         .data : AT(__data_loc) {                                                
244 _data = .; /* address in memory */
245 _sdata = .;
246
247 /*
248 * first, the init task union, aligned
249 * to an 8192 byte boundary.
250 */
251 INIT_TASK_DATA(THREAD_SIZE)
======>
. = ALIGN(8192); \
*(.data..init_task)

TODO

for_each_process(p)

1
/*
 * for_each_process(p) - loop header that walks tasks via next_task().
 * Starts from &init_task, advances p before every test and stops once p is
 * &init_task again, so init_task itself is never handed to the loop body.
 */
#define for_each_process(p)  for (p = &init_task ; (p = next_task(p)) != &init_task;)

Start with init_task and iterate through all processes

Linux links the task structures of all processes into a circular doubly linked list; the loop keeps following next_task() from init_task and ends when it arrives back at &init_task.

pid_task()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/*
 * pid_task - return the first task attached to @pid for the given @type.
 * @pid:  the struct pid to look up; may be NULL
 * @type: index into pid->tasks[] selecting which list is read
 *
 * Returns NULL when @pid is NULL or when the selected list is empty.
 */
struct task_struct *pid_task(struct pid *pid, enum pid_type type)
{
struct task_struct *result = NULL;
/* A NULL pid leaves result at NULL. */
if (pid) {
struct hlist_node *first;
/* Fetch the first node of pid->tasks[type] through an RCU-checked dereference. */
first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
lockdep_tasklist_lock_is_held());

if (first)
/* Convert the list node back into the task_struct that embeds it. */
result = hlist_entry(first, struct task_struct, pids[(type)].node);
}
return result;
}

module_param()

For the explanation of the three parameters of module_param.
module_param(worldNum,int,0644);
The first parameter is the name of the parameter, defined by yourself
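The second parameter is the type of the variable, such as byte, short, ushort, int, uint, long, ulong, charp, bool or invbool (floating-point types are not supported).
The third parameter is the permission of the parameter's entry in sysfs, similar to the permission bits of a file; it determines which users can read or modify the value of this parameter.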

1
2
* @perm is 0 if the variable is not to appear in sysfs, or 0444
* for world-readable, 0644 for root-writable, etc.

Translated with www.DeepL.com/Translator (free version)

list_for_each()

1
2
3
4
5
6
7
8
/**
 * list_for_each    -   iterate over a list
 * @pos:    the &struct list_head to use as a loop counter.
 * @head:   the head for your list.
 *
 * Visits every node after @head and stops when the walk is back at @head.
 * The node after the current one is handed to prefetch() on each step.
 * Nothing may follow a continuation backslash on its line, not even spaces.
 */
#define list_for_each(pos, head) \
    for (pos = (head)->next, prefetch(pos->next); pos != (head); \
        pos = pos->next, prefetch(pos->next))

list_entry()

1
2
3
4
5
6
7
8
9
10
/*
 * list_entry - get the structure that embeds a given member.
 * @ptr:    pointer to the member inside the structure
 * @type:   type of the enclosing structure
 * @member: name of the member within the structure
 */
#define list_entry(ptr, type, member) \
	container_of(ptr, type, member)

/*
 * container_of - cast a pointer to a member out to the enclosing structure.
 *
 * __mptr has the member's own pointer type, so passing a pointer of the
 * wrong type draws a compiler diagnostic; the structure address is then the
 * member address minus the member's byte offset. __typeof__ is the spelling
 * of typeof that compilers also accept in strict ISO C modes.
 */
#define container_of(ptr, type, member) \
	({ \
		const __typeof__(((type *)0)->member) *__mptr = (ptr); \
		(type *)((char *)__mptr - offsetof(type, member)); \
	})

/* Byte offset of MEMBER inside TYPE; kept only if no header defined it yet. */
#ifndef offsetof
#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
#endif
Author

ACce1er4t0r

Posted on

2022-03-09

Updated on

2023-04-22

Licensed under