You and Your Research How To Become A Hacker 编程的智慧 An open letter to those who want to start programming Teach Yourself Programming in Ten Years

ELF 映像的装入

GNU 把对于动态连接 ELF 映像的支持作了分工：把 ELF 映像的装入/启动放在 Linux 内核中；而把动态连接的实现放在用户空间，并为此提供一个称为“解释器”的工具软件，而解释器的装入/启动也由内核负责。

系統呼叫

Classification and Grouping of Linux System Calls
Anomaly Detection Based on System Call Classification

Generally, systems provide a library or API that sits between normal programs and the operating system. On Unix-like systems, that API is usually part of an implementation of the C library (libc), such as glibc, that provides wrapper functions for the system calls, often named the same as the system calls that they call. On Windows NT, that API is part of the Native API, in the ntdll.dll library; this is an undocumented API used by implementations of the regular Windows API and directly used by some system programs on Windows.

System call

syscall_table_32.S 裡定義系統呼叫函式指針的集合。

ENTRY(sys_call_table)
  .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */
  .long sys_exit
  .long ptregs_fork
  .long sys_read
  .long sys_write
  /* 略 */

entry_32.S 定義系統呼叫的入口匯編 (handler)。與 FreeBSD 不同，FreeBSD 是以棧傳遞參數，Linux 則是用暫存器傳參。

  # system call handler stub
ENTRY(system_call)
  RING0_INT_FRAME     # can't unwind into user space anyway
  pushl %eax      # save orig_eax
  CFI_ADJUST_CFA_OFFSET 4
  SAVE_ALL            # 將暫存器內容 (含以暫存器傳入的系統呼叫參數) 存入核心棧。
  GET_THREAD_INFO(%ebp)
          # system call tracing in operation / emulation
  testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
  jnz syscall_trace_entry
  cmpl $(nr_syscalls), %eax
  jae syscall_badsys
syscall_call:
  call *sys_call_table(,%eax,4)
  movl %eax,PT_EAX(%esp)    # store the return value
syscall_exit:
  LOCKDEP_SYS_EXIT
  DISABLE_INTERRUPTS(CLBR_ANY)  # make sure we don't miss an interrupt
          # setting need_resched or sigpending
          # between sampling and the iret
  TRACE_IRQS_OFF
  movl TI_flags(%ebp), %ecx
  testl $_TIF_ALLWORK_MASK, %ecx  # current->work
  jne syscall_exit_work

透過 sys_call_table 跳轉至相應的函式。

/*
 * read(2) entry point.
 *
 * Looks up the open file for descriptor fd and forwards the request to
 * vfs_read(), using and then updating the file's current position.
 *
 * Returns whatever vfs_read() returns, or -EBADF when fget_light() finds
 * no file for fd.
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
  struct file *file;
  ssize_t ret = -EBADF;   /* result if fd does not resolve to a file */
  int fput_needed;        /* set by fget_light(), consumed by fput_light() */
 
  file = fget_light(fd, &fput_needed);
  if (file) {
    /* Work on a local copy of the position, then store it back. */
    loff_t pos = file_pos_read(file);
    ret = vfs_read(file, buf, count, &pos);
    file_pos_write(file, pos);
    /* Release the reference taken by fget_light(). */
    fput_light(file, fput_needed);
  }
 
  return ret;
}

Kernel command using Linux system calls

Linux Kernel 排程機制介紹

當時鐘發出中斷時，會調用 timer_interrupt 處理該中斷。

Native POSIX Thread Library 0.1 released

Embedded Linux 專欄

添加系統呼叫

以下以 3.5 版為例。

編譯內核。

$ wget http://www.kernel.org/pub/linux/kernel/v3.0/linux-3.5.3.tar.bz2
$ tar xvf linux-3.5.3.tar.bz2; cd linux-3.5.3
$ wget http://people.cs.nctu.edu.tw/~chenwj/source/config-qemu-x86
$ mv config-qemu-x86 .config
$ make ARCH=i386
$ qemu-system-i386 -kernel arch/x86/boot/bzImage -hda disk-x86.raw -append "root=/dev/sda" -vnc :3

編輯 arch/x86/syscalls/syscall_32.tbl。

diff -ruN linux-3.5.3/arch/x86/syscalls/syscall_32.tbl linux-3.5.3.new/arch/x86/syscalls/syscall_32.tbl
--- linux-3.5.3/arch/x86/syscalls/syscall_32.tbl        2012-08-26 10:32:13.000000000 +0800
+++ linux-3.5.3.new/arch/x86/syscalls/syscall_32.tbl    2012-08-28 14:17:16.098536453 +0800
@@ -356,3 +356,4 @@
 347    i386    process_vm_readv        sys_process_vm_readv            compat_sys_process_vm_readv
 348    i386    process_vm_writev       sys_process_vm_writev           compat_sys_process_vm_writev
 349    i386    kcmp                    sys_kcmp
+350    i386    helloworld              sys_helloworld

編輯 include/linux/syscalls.h。

diff -ruN linux-3.5.3/include/linux/syscalls.h linux-3.5.3.new/include/linux/syscalls.h
--- linux-3.5.3/include/linux/syscalls.h        2012-08-26 10:32:13.000000000 +0800
+++ linux-3.5.3.new/include/linux/syscalls.h    2012-08-28 14:19:47.171854560 +0800
@@ -860,4 +860,6 @@
 
 asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
                         unsigned long idx1, unsigned long idx2);
+
+asmlinkage long sys_helloworld(void);
 #endif

新增檔案實作新增系統呼叫。

diff -ruN linux-3.5.3/arch/x86/kernel/helloworld.c linux-3.5.3.new/arch/x86/kernel/helloworld.c
--- linux-3.5.3/arch/x86/kernel/helloworld.c    1970-01-01 08:00:00.000000000 +0800
+++ linux-3.5.3.new/arch/x86/kernel/helloworld.c        2012-08-28 14:30:34.139545617 +0800
@@ -0,0 +1,12 @@
+#include <linux/linkage.h>
+#include <linux/kernel.h>
+
+/*
+ * Demo system call: writes a greeting to the kernel log and returns 0.
+ * asmlinkage matches the prototype added to include/linux/syscalls.h.
+ */
+asmlinkage long sys_helloworld(void)
+{
+    printk("hello world from linux kernel!\n");
+    return 0;
+}

修改 arch/x86/kernel/Makefile。

diff -ruN linux-3.5.3/arch/x86/kernel/Makefile linux-3.5.3.new/arch/x86/kernel/Makefile
--- linux-3.5.3/arch/x86/kernel/Makefile        2012-08-26 10:32:13.000000000 +0800
+++ linux-3.5.3.new/arch/x86/kernel/Makefile    2012-08-28 14:34:55.763928000 +0800
@@ -34,6 +34,7 @@
 obj-y                  += tsc.o io_delay.o rtc.o
 obj-y                  += pci-iommu_table.o
 obj-y                  += resource.o
+obj-y                  += helloworld.o
 
 obj-y                          += process.o
 obj-y                          += i387.o xsave.o

運行範例。

#include <unistd.h>
#include <sys/syscall.h>
 
#define NR_SYSCALL 350
 
int main()
{
    /* Invoke the newly added system call by number and hand its return
     * value back to the shell as the process exit status. */
    long status = syscall(NR_SYSCALL);

    return status;
}

例外

traps.c 裡的 trap_init 會設置例外的進入點。

void __init trap_init(void)
{
  set_intr_gate(0, &divide_error);
  set_intr_gate_ist(1, &debug, DEBUG_STACK);
  set_intr_gate_ist(2, &nmi, NMI_STACK);
 
  /* 略 */
}

entry_32.S 裡面包含例外處理函式的進入點。

ENTRY(divide_error)
  RING0_INT_FRAME
  pushl $0      # no error code
  CFI_ADJUST_CFA_OFFSET 4
  pushl $do_divide_error
  CFI_ADJUST_CFA_OFFSET 4
  jmp error_code
  CFI_ENDPROC
END(divide_error)

最後會到 traps.c 中對應的函式執行。和 FreeBSD 稍有不同，FreeBSD 統一在 /usr/src/sys/i386/i386/trap.c 中的 trap 函式處理。

/*
 * DO_ERROR_INFO - generate the C-level handler do_<name>() for a trap.
 *
 * The generated function fills a siginfo_t from signr, sicode and siaddr,
 * then calls notify_die(); if that returns NOTIFY_STOP the handler returns
 * without doing anything else. Otherwise it calls conditional_sti(regs)
 * and passes the trap on to do_trap() together with the siginfo.
 *
 * The instantiation below produces do_divide_error() for trap 0, using
 * SIGFPE / FPE_INTDIV and regs->ip as the reported address.
 */
#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr)   \
dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
{                 \
  siginfo_t info;             \
  info.si_signo = signr;            \
  info.si_errno = 0;            \
  info.si_code = sicode;            \
  info.si_addr = (void __user *)siaddr;       \
  if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr)  \
              == NOTIFY_STOP) \
    return;             \
  conditional_sti(regs);            \
  do_trap(trapnr, signr, str, regs, error_code, &info);   \
}
 
DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)

A Few Things You Didn’t Know about Signals in Linux Part 1

進程

struct task_struct (include/linux/sched.h)

虛擬內存

注意! 內核中存取變數都是以虛擬位址存取，必要時透過 \_\_pa 巨集減去一個位移量，得到對映的物理位址。關於虛擬內存相關資料結構請見 9.2. The Memory Descriptor 和 9.3. Memory Regions。第 31 頁。Linux 內核源代碼情景分析 2.3 節。Linux 目前將頁表結構抽象成: pgd、pud、pmd 和 pte 四層頁表。pgd_t、pud_t、pmd_t 和 pte_t 代表的是其中的頁表項 (entry)。可以分別透過 pgd_offset、pud_offset、pmd_offset 和 pte_offset_map 取得。當前述頁表項為空時，分別呼叫 pud_alloc、pmd_alloc 和 pte_alloc_one 分配 pud、pmd、pte。同時配合權限構成相應的頁表項，如 mk_pmd 或是 mk_pte。

struct mm_struct (include/linux/mm_types.h)

描述進程的整體虛擬地址空間。

struct task_struct {
 
        ... 略 ...
 
        struct mm_struct *mm, *active_mm;
 
        ... 略 ...
};
 
struct mm_struct {
        // 指向進程第一個 VMA，之後可透過 VMA 中的 vm_next 遍歷進程所有的 VMA。
         struct vm_area_struct * mmap;           /* list of VMAs */
 
        ... 略 ...
};

struct vm_area_struct (include/linux/mm_types.h)

Linux 將進程的虛擬地址空間分成數個區塊 (area)。基本上 ELF 中的段 (segment) 會對應到一個 VMA，請見程序員的自我修養第 6 章: 可執行檔的裝載與進程。

struct vm_area_struct {
        struct mm_struct * vm_mm;       /* The address space we belong to. */
        unsigned long vm_start;         /* Our start address within vm_mm. */
        unsigned long vm_end;           /* The first byte after our end address
                                           within vm_mm. */
 
        /* linked list of VM areas per task, sorted by address */
        struct vm_area_struct *vm_next, *vm_prev;
 
        ... 略 ...
};

Linux 的 Virtual Memory Areas（VMA）：基本概念介紹
Linux 的 Virtual Memory Areas（VMA）：Process 與 VMA 整體觀念
小談 mmap() 與 VMA
vm_area_struct 结构

struct pglist_data (include/linux/mmzone.h)
- 每個處理器的物理內存稱之為節點 (node)。
struct zone (include/linux/mmzone.h)
- 節點再分為數個區域 (zone)。
struct page (include/linux/mm_types.h)
- 描述物理頁的資料結構。區域內含數個物理頁。

Chapter 6 Physical Page Allocation。Linux 基本上將系統上的物理內存分為數個節點 (node)，以 pg_data_t 表示，每個節點關連到一個處理器，這主要是用來適應 NUMA。節點又分為數個內存區域 (zone)，分為 ZONE_DMA、ZONE_NORMAL 和 ZONE_HIGHMEM，關於 Linux 如何描述物理內存，請見 Chapter 2 Describing Physical Memory。ZONE_DMA 是物理內存前 16 MB 的區域，供周邊使用; ZONE_NORMAL 是 16 - 896 MB 的物理內存，將會被內核映射至虛擬位址高位址處，也就是映射至內核空間; ZONE_HIGHMEM 是物理內存剩下的區域。區域有三種不同的水印 (watermark)，分別為 pages_high、pages_low 和 pages_min，代表該區域物理頁的使用量。kswapd 會在剩餘物理頁為 pages_low 時被喚醒，開始回收頁面直到剩餘物理頁為 pages_high 為止。如果前述回收頁面仍抵銷不了物理頁的消耗，導致剩餘物理頁為 pages_min 時，allocator 在分配物理頁的同時，也會做和 kswapd 一樣的工作，試圖同時回收物理頁。

Chapter 8 Slab Allocator 處理申請小塊內存的請求，避免內部破碎。

When a User Mode process asks for dynamic memory, it doesn't get additional page frames; instead, it gets the right to use a new range of linear addresses, which become part of its address space. This interval is called a "memory region."

為了快速定位有哪些頁表項指向該物理頁，Linux 使用 reverse mapping，請見 17.2. Reverse Mapping。直接在 struct page 維護指向此物理頁的頁表項，並非好的做法。目前的做法，內核會維護物理頁到 VMA 的反向連結，VMA 再透過 struct mm_struct 中的 pgd_t * pgd 遍歷該進程的頁表。

術語

Get Free Page (GFP)
SLOB (Simple List Of Blocks)
PAT (Page Attribute Table)
- 跟 x86 有關。

頁缺失

目標: 了解物理頁如何被內核釋放掉。

do_page_fault (arch/x86/mm/fault.c)。

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
dotraplinkage void __kprobes
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
        ... 略 ...
 
        /* Get the faulting address: */
        address = read_cr2();
 
good_area:
        // 配置物理頁。
         fault = handle_mm_fault(mm, vma, address, flags);
 
        ... 略 ...
}

handle_mm_fault (mm/memory.c)。

int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, unsigned int flags)
{
        ... 略 ...
 
 
        pte = pte_offset_map(pmd, address); // 此為最後一層的頁表 (page table)
 
        // 分配物理頁。
          return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}

handle_pte_fault (mm/memory.c)。

int handle_pte_fault(struct mm_struct *mm,
                     struct vm_area_struct *vma, unsigned long address,
                     pte_t *pte, pmd_t *pmd, unsigned int flags)
{
        ... 略 ...
 
        entry = *pte;
        if (!pte_present(entry)) {
                // 該物理頁未被配置。
                   if (pte_none(entry)) {
                        // 該物理頁映射至檔案。
                        if (vma->vm_ops) {
                                if (likely(vma->vm_ops->fault))
                                        return do_linear_fault(mm, vma, address,
                                                pte, pmd, flags, entry);
                        }
                        return do_anonymous_page(mm, vma, address,
                                                 pte, pmd, flags);
                }
                   if (pte_file(entry))
                        return do_nonlinear_fault(mm, vma, address,
                                        pte, pmd, flags, entry);
                return do_swap_page(mm, vma, address,
                                        pte, pmd, flags, entry);
        }
 
        ... 略 ...
}

外部連結

We need a way to read from and write to page tables. This means accessing them by a virtual address, since we're using paging.

During initialization of the virtual memory manager, the last PDE in the page directory is set to the physical address of the page directory itself.

http://histemiss.blog.163.com/blog/static/30487860201251803955826/

進程內存用量

# 由於各式各樣複雜的原因，底下輸出僅供參考。
$ ps u -p `pidof a.out`
USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
chenwj   31952  0.0  0.0   3884   472 pts/42   S+   14:51   0:00 ./a.out
# 關注最底下 writeable/private 的數據。
$ pmap -d `pidof a.out`
32280:   ./a.out
Address           Kbytes Mode  Offset           Device    Mapping
0000000000400000       4 r-x-- 0000000000000000 000:0000f a.out
0000000000600000       4 r---- 0000000000000000 000:0000f a.out
0000000000601000       4 rw--- 0000000000001000 000:0000f a.out
00000000020d5000     132 rw--- 0000000000000000 000:00000   [ anon ]
00007fa2458be000    1412 r-x-- 0000000000000000 0fe:00000 libc-2.12.2.so
00007fa245a1f000    2048 ----- 0000000000161000 0fe:00000 libc-2.12.2.so
00007fa245c1f000      16 r---- 0000000000161000 0fe:00000 libc-2.12.2.so
00007fa245c23000       4 rw--- 0000000000165000 0fe:00000 libc-2.12.2.so
00007fa245c24000      20 rw--- 0000000000000000 000:00000   [ anon ]
00007fa245c29000     120 r-x-- 0000000000000000 0fe:00000 ld-2.12.2.so
00007fa245e2f000      12 rw--- 0000000000000000 000:00000   [ anon ]
00007fa245e44000       8 rw--- 0000000000000000 000:00000   [ anon ]
00007fa245e46000       4 r---- 000000000001d000 0fe:00000 ld-2.12.2.so
00007fa245e47000       4 rw--- 000000000001e000 0fe:00000 ld-2.12.2.so
00007fa245e48000       4 rw--- 0000000000000000 000:00000   [ anon ]
00007fff29b9a000      84 rw--- 0000000000000000 000:00000   [ stack ]
00007fff29bff000       4 r-x-- 0000000000000000 000:00000   [ anon ]
ffffffffff600000       4 r-x-- 0000000000000000 000:00000   [ anon ]
mapped: 3888K    writeable/private: 272K    shared: 0K

RSS (Resident Set Size)
- 該進程物理內存用量。
VSZ (Virtual Size)
- 該進程虛擬內存用量。
Need explanation on Resident Set Size/Virtual Size
Measuring RAM usage of a program
Linux: How to measure actual memory usage of an application or process?
Understanding memory usage on Linux

傾印物理內存

/dev/mem 代表當前機器上物理內存的內容。一般是透過 mmap 將物理內存的某個區段映射至目前進程的虛擬地址空間，進程對該虛擬地址空間的讀寫，即代表對相應物理內存的讀寫，進而使得進程可以直接讀寫物理內存。通常情況下，是用作 MMIO。對 /dev/mem 而言，是以 byte 為單位定址，且其代表的是物理位址 ²⁾。mmap 映射 /dev/mem 有其限制，請見 accessing mmaped /dev/mem?³⁾。

#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdlib.h>
 
/*
 * Dump, overwrite and re-read a small window of physical memory through
 * /dev/mem.
 *
 * For every byte in the window the program prints the virtual address the
 * byte is mapped at and its current value, writes the byte's index into
 * it, reads it back and prints the new value.
 *
 * Returns 0 on success; aborts if /dev/mem cannot be opened or mapped.
 */
int main(void)
{
    /*
     * Physical window to inspect: PHYS_LEN bytes starting at PHYS_BASE,
     * i.e. physical addresses 0x20000 - 0x200fe.
     * The offset handed to mmap() must be page-aligned; 0x20000 is.
     */
    enum { PHYS_BASE = 0x20000, PHYS_LEN = 0xff };

    void *map_base;                /* start of the mapping (virtual address) */
    volatile unsigned char *bytes; /* byte view of the mapping; volatile so that
                                      every read and write really hits memory */
    unsigned long virt_addr;       /* virtual address of the current byte */
    unsigned char val;             /* value of the current byte */
    int i, fd;

    fd = open("/dev/mem", O_RDWR | O_SYNC);
    if (fd == -1) {
        perror("open /dev/mem");
        abort();
    }

    /* Map the physical window into this process's virtual address space. */
    map_base = mmap(NULL, PHYS_LEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, PHYS_BASE);

    /* mmap() reports failure with MAP_FAILED, never with NULL. */
    if (map_base == MAP_FAILED) {
        perror("mmap /dev/mem");
        close(fd);
        abort();
    }
    bytes = map_base;

    for (i = 0; i < PHYS_LEN; ++i)
    {
        /* Read the current content of physical address PHYS_BASE + i. */
        virt_addr = (unsigned long)(bytes + i);
        val = bytes[i];
        printf("virt_addr: 0x%08lx val: 0x%x\t\t", virt_addr, val);

        /* Write a new value, then read the same byte back. */
        bytes[i] = (unsigned char)i;
        val = bytes[i];
        printf("virt_addr: 0x%08lx val: 0x%x\n", virt_addr, val);
    }

    munmap(map_base, PHYS_LEN);
    close(fd);

    return 0;
}

其它

Modifying current process' pte through /dev/mem? 的目的是要讓 4G 以上和以下的虛擬位址映射至同一個物理頁。目前是透過 /dev/mem 配合當前進程的 CR3，修改 4G 以下虛擬位址的頁表項，使其改指向 4G 以上虛擬位址所映射的物理頁。理論上，牽涉其中的物理頁，其相關資料結構 struct page 中的 count 和 mapcount 應做適當的更新。其中一種狀況是，被多個頁表項所映射的物理頁，其 mapcount 並未做相應的更新，這會導致進程結束時，內核回收其頁面時發現 mapcount 值有誤。關於頁面回收請見 linux 页面回收浅析和 Linux 2.6 中的页面回收与反向映射。rss_stat 是用來統計進程所使用的物理內存數量，也需要更新。

方法一

BUG: Bad page map in process mmap  pte:8000000007eb2067 pmd:07acb067
page:ffffea00001fac80 count:0 mapcount:-1 mapping:          (null) index:0x101b7b
page flags: 0x4000000000000014(referenced|dirty)
addr:0000000101b7b000 vm_flags:00100073 anon_vma:ffff880007ab0708 mapping:          (null) index:101b7b
Pid: 609, comm: mmap Tainted: G    B        3.5.3 #7
Call Trace:
 [<ffffffff8107abcc>] ? print_bad_pte+0x1d2/0x1ea
 [<ffffffff8107bf18>] ? unmap_single_vma+0x3a0/0x56d
 [<ffffffff8107c745>] ? unmap_vmas+0x2c/0x46
 [<ffffffff8108106b>] ? exit_mmap+0x6e/0xdd
 [<ffffffff8101cc4f>] ? do_page_fault+0x30f/0x348
 [<ffffffff81020ce6>] ? mmput+0x20/0xb4
 [<ffffffff810256ae>] ? exit_mm+0x105/0x110
 [<ffffffff8103bb6c>] ? hrtimer_try_to_cancel+0x67/0x70
 [<ffffffff81026b59>] ? do_exit+0x211/0x711
 [<ffffffff810272e0>] ? do_group_exit+0x76/0xa0
 [<ffffffff8102731c>] ? sys_exit_group+0x12/0x19
 [<ffffffff812f3662>] ? system_call_fastpath+0x16/0x1b
BUG: Bad rss-counter state mm:ffff880007a496c0 idx:0 val:-1
BUG: Bad rss-counter state mm:ffff880007a496c0 idx:1 val:1

system_call_fastpath -> sys_exit_group -> do_group_exit -> do_exit -> hrtimer_try_to_cancel
  -> exit_mm -> mmput (kernel/fork.c) -> do_page_fault -> exit_mmap (mm/mmap.c)
  -> unmap_vmas (mm/memory.c) -> unmap_single_vma -> print_bad_pte

進程退出，調用 mmput (kernel/fork.c) 回收進程使用的物理頁，並清空頁表。請見 Linux内核释放页表的过程。

/*
 * Drop one mm_users reference on @mm.
 *
 * When atomic_dec_and_test() reports that the count reached zero, the
 * address space is torn down: the per-subsystem exit hooks run, exit_mmap()
 * releases the mappings, the mm is taken off mmlist if it is on it, the
 * binfmt module reference is dropped and finally mmdrop() is called.
 */
void mmput(struct mm_struct *mm)
{
        might_sleep();
 
        if (atomic_dec_and_test(&mm->mm_users)) {
                uprobe_clear_state(mm);
                exit_aio(mm);
                ksm_exit(mm);
                khugepaged_exit(mm); /* must run before exit_mmap */
                /*
                 * Error message 1 ("BUG: Bad page map") is printed from
                 * below this call: the call trace shows print_bad_pte()
                 * reached via exit_mmap() -> unmap_vmas().
                 */
                exit_mmap(mm);
                set_mm_exe_file(mm, NULL);
                if (!list_empty(&mm->mmlist)) {
                        /* mmlist is modified under mmlist_lock. */
                        spin_lock(&mmlist_lock);
                        list_del(&mm->mmlist);
                        spin_unlock(&mmlist_lock);
                }
                if (mm->binfmt)
                        module_put(mm->binfmt->module);
                /*
                 * Error message 2 ("BUG: Bad rss-counter state") comes from
                 * check_mm(), which __mmdrop() calls; presumably mmdrop()
                 * ends up in __mmdrop() - confirm against kernel/fork.c.
                 */
                mmdrop(mm);
        }
}

exit_mmap (mm/mmap.c) 依序釋放進程的 VMA。9.3. Memory Regions 中的 9.3.5.3. The unmap_region() function 可以供作參考。

void exit_mmap(struct mm_struct *mm)
{
        // 存放平台特定的資訊以備 tlb_remove_page 回收物理頁使用。
        struct mmu_gather tlb;
 
        ... 略 ...
 
        vma = mm->mmap;
 
        lru_add_drain();
        flush_cache_mm(mm);
        // 初始化 mmu_gather，第三個參數為 1 代表我們欲銷毀整個虛擬空間。
         tlb_gather_mmu(&tlb, mm, 1);
 
        // 釋放所有 VMA 其所包含的物理頁。
         unmap_vmas(&tlb, vma, 0, -1);
 
        // 釋放頁表。      
         free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
        tlb_finish_mmu(&tlb, 0, -1);
 
        while (vma) {
                if (vma->vm_flags & VM_ACCOUNT)
                        nr_accounted += vma_pages(vma);
                vma = remove_vma(vma);
        }
 
}

unmap_vmas (mm/memory.c) 回收物理页。

/*
 * Unmap every VMA on the list starting at @vma that begins below @end_addr.
 *
 * @tlb:        mmu_gather handed through to unmap_single_vma()
 * @vma:        first VMA to process; dereferenced unconditionally for
 *              vm_mm, so it must not be NULL on entry
 * @start_addr: start of the address range to unmap
 * @end_addr:   end of the address range to unmap
 *
 * The whole walk is bracketed by the mmu_notifier range start/end calls.
 */
void unmap_vmas(struct mmu_gather *tlb,
                struct vm_area_struct *vma, unsigned long start_addr,
                unsigned long end_addr)
{
        struct mm_struct *mm = vma->vm_mm;
 
        mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
        /* Follow vm_next until the list ends or a VMA starts at/after end_addr. */
        for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
                unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
        mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
}

unmap_single_vma (mm/memory.c)。

/*
 * Unmap the part of @vma that overlaps [start_addr, end_addr).
 *
 * @tlb:        mmu_gather passed on to unmap_page_range()
 * @vma:        the VMA to process
 * @start_addr: start of the requested range
 * @end_addr:   end of the requested range
 * @details:    passed on unchanged to unmap_page_range()
 *
 * Returns early when the requested range does not intersect the VMA.
 */
static void unmap_single_vma(struct mmu_gather *tlb,
                struct vm_area_struct *vma, unsigned long start_addr,
                unsigned long end_addr,
                struct zap_details *details)
{
        /* Clamp the requested range to the VMA's own [vm_start, vm_end). */
        unsigned long start = max(vma->vm_start, start_addr);
        unsigned long end;
 
        if (start >= vma->vm_end)
                return;
        end = min(vma->vm_end, end_addr);
        if (end <= vma->vm_start)
                return;
 
        /* File-backed VMA: let uprobes know about the range going away. */
        if (vma->vm_file)
                uprobe_munmap(vma, start, end);
 
        if (unlikely(is_pfn_mapping(vma)))
                untrack_pfn_vma(vma, 0, 0);
 
        if (start != end) {
                if (unlikely(is_vm_hugetlb_page(vma))) {
                        /*
                         * It is undesirable to test vma->vm_file as it
                         * should be non-null for valid hugetlb area.
                         * However, vm_file will be NULL in the error
                         * cleanup path of do_mmap_pgoff. When
                         * hugetlbfs ->mmap method fails,
                         * do_mmap_pgoff() nullifies vma->vm_file
                         * before calling this function to clean up.
                         * Since no pte has actually been setup, it is
                         * safe to do nothing in this case.
                         */
                        if (vma->vm_file)
                                unmap_hugepage_range(vma, start, end, NULL);
                } else
                        /* Ordinary VMA: walk and zap its page tables. */
                        unmap_page_range(tlb, vma, start, end, details);
        }
}

unmap_page_range (mm/memory.c) 依次释放 pud，pmd 和 pte。

unmap_page_range → zap_pud_range → zap_pmd_range → zap_pte_range

/*
 * Walk the top-level page-table entries covering [addr, end) of @vma and
 * hand each populated one to zap_pud_range().
 *
 * @tlb:     mmu_gather passed down to the zap_* helpers
 * @vma:     VMA the range belongs to; its vm_mm supplies the page tables
 * @addr:    start of the range; must be below @end (BUG_ON otherwise)
 * @end:     end of the range
 * @details: optional zap details, passed down to zap_pud_range()
 */
static void unmap_page_range(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long addr, unsigned long end,
                             struct zap_details *details)
{
        pgd_t *pgd;
        unsigned long next;
 
        /*
         * A details struct with neither check_mapping nor nonlinear_vma set
         * is treated exactly like "no details".
         */
        if (details && !details->check_mapping && !details->nonlinear_vma)
                details = NULL;
 
        BUG_ON(addr >= end);
        mem_cgroup_uncharge_start();
        tlb_start_vma(tlb, vma);
        pgd = pgd_offset(vma->vm_mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                /*
                 * Skip empty/bad entries; 'continue' still evaluates the
                 * loop condition, so pgd and addr advance as usual.
                 */
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                next = zap_pud_range(tlb, vma, pgd, addr, next, details);
        } while (pgd++, addr = next, addr != end);
        tlb_end_vma(tlb, vma);
        mem_cgroup_uncharge_end();
}

zap_pte_range (mm/memory.c)。

static unsigned long zap_pte_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pmd_t *pmd,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
    ... 略 ...
 
        do {
 
                if (pte_present(ptent)) {
                        struct page *page;
 
                        // 取回 page 結構。
                            page = vm_normal_page(vma, addr, ptent);
 
                        // 把 pte 清零。
                            ptent = ptep_get_and_clear_full(mm, addr, pte,
                                                        tlb->fullmm);
 
                        // 收集欲回收的 page。
                            tlb_remove_tlb_entry(tlb, pte, addr);
 
                        // 把 page 的引用计数减 1。
                            if (PageAnon(page))
                                rss[MM_ANONPAGES]--;
                        else {
                                if (pte_dirty(ptent))
                                        set_page_dirty(page);
                                if (pte_young(ptent) &&
                                    likely(!VM_SequentialReadHint(vma)))
                                        mark_page_accessed(page);
                                rss[MM_FILEPAGES]--;
                        }
 
                        // 清除物理頁至頁表項的反向映射。
                            page_remove_rmap(page);
                        // 理論上物理頁被頁表項指到的次數應大於或等於零。
                            // 若底下條件成立，代表該物理頁沒有被頁表項指到。
                            if (unlikely(page_mapcount(page) < 0))
                                print_bad_pte(vma, addr, ptent, page);
 
        } while (pte++, addr += PAGE_SIZE, addr != end);
 
        add_mm_rss_vec(mm, rss);
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(start_pte, ptl);
 
        if (force_flush) {
                force_flush = 0;
                tlb_flush_mmu(tlb); // 釋放物理頁。
                   if (addr != end)
                        goto again;
        }
 
        return addr;
}

写时复制细节问题

free_pgtables (mm/memory.c)。

/*
 * Free the page tables backing every VMA on the list starting at @vma.
 *
 * @tlb:     mmu_gather passed to the range-freeing helpers
 * @vma:     first VMA on the list; the list is followed through vm_next
 * @floor:   lower bound handed to the range-freeing helpers
 * @ceiling: upper bound used for the last VMA; for every other VMA the
 *           start of the next VMA is used instead
 *
 * Each VMA is unlinked from its anon and file mappings before its
 * page-table range is freed.
 */
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
                unsigned long floor, unsigned long ceiling)
{
        while (vma) {
                struct vm_area_struct *next = vma->vm_next;
                /* Start of the range to free; fixed before any merging below. */
                unsigned long addr = vma->vm_start;
 
                /*
                 * Hide vma from rmap and truncate_pagecache before freeing
                 * pgtables
                 */
                unlink_anon_vmas(vma);
                unlink_file_vma(vma);
 
                if (is_vm_hugetlb_page(vma)) {
                        hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
                                floor, next? next->vm_start: ceiling);
                } else {
                        /*
                         * Optimization: gather nearby vmas into one call down
                         */
                        /*
                         * "Nearby" means the next VMA starts no more than
                         * PMD_SIZE past the current end and is not hugetlb;
                         * each absorbed VMA is unlinked here as well.
                         */
                        while (next && next->vm_start <= vma->vm_end + PMD_SIZE
                               && !is_vm_hugetlb_page(next)) {
                                vma = next;
                                next = vma->vm_next;
                                unlink_anon_vmas(vma);
                                unlink_file_vma(vma);
                        }
                        /* Free from the first VMA's start to the last absorbed VMA's end. */
                        free_pgd_range(tlb, addr, vma->vm_end,
                                floor, next? next->vm_start: ceiling);
                }
                vma = next;
        }
}

free_pgd_range (mm/memory.c)

void free_pgd_range(struct mmu_gather *tlb,
                        unsigned long addr, unsigned long end,
                        unsigned long floor, unsigned long ceiling)
{
        ... 略 ...
 
        pgd = pgd_offset(tlb->mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                free_pud_range(tlb, pgd, addr, next, floor, ceiling);
        } while (pgd++, addr = next, addr != end);
}

free_pud_range

        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
                free_pmd_range(tlb, pud, addr, next, floor, ceiling);
        } while (pud++, addr = next, addr != end);

free_pmd_range (mm/memory.c)

static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                                unsigned long addr, unsigned long end,
                                unsigned long floor, unsigned long ceiling)
{
        ... 略 ...
 
        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_none_or_clear_bad(pmd))
                        continue;
                free_pte_range(tlb, pmd, addr);
        } while (pmd++, addr = next, addr != end);
 
 
        ... 略 ...
}

free_pte_range (mm/memory.c)

/*
 * Release the PTE page that @pmd points to.
 *
 * @tlb:  mmu_gather the PTE page is handed to via pte_free_tlb()
 * @pmd:  pmd entry to clear
 * @addr: address passed on to pte_free_tlb()
 *
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
                           unsigned long addr)
{
        /* Fetch the PTE page from the entry before the entry is cleared. */
        pgtable_t token = pmd_pgtable(*pmd);
        pmd_clear(pmd);
        pte_free_tlb(tlb, token, addr);
        /* One PTE page fewer is accounted to this mm. */
        tlb->mm->nr_ptes--;
}

mmput 呼叫 mmdrop (kernel/fork.c)。

/*
 * Final teardown of @mm: free its pgd, destroy the context and the mmu
 * notifier state, run the check_mm() sanity check and free the mm itself.
 * Calling this on init_mm is a bug.
 */
void __mmdrop(struct mm_struct *mm)
{
        BUG_ON(mm == &init_mm);
        mm_free_pgd(mm);
        destroy_context(mm);
        mmu_notifier_mm_destroy(mm);
        /* Reports any rss counter that is still non-zero. */
        check_mm(mm);
        free_mm(mm);
}

check_mm (kernel/fork.c)。

/*
 * Sanity-check @mm: every one of the NR_MM_COUNTERS rss counters is
 * expected to be zero here. Each non-zero counter is reported with a
 * KERN_ALERT "BUG: Bad rss-counter state" message giving the mm pointer,
 * the counter index and its value.
 */
static void check_mm(struct mm_struct *mm)
{
        int i;
 
        for (i = 0; i < NR_MM_COUNTERS; i++) {
                long x = atomic_long_read(&mm->rss_stat.count[i]);
 
                // Same as if (x): the body runs when x != 0. unlikely() is a hint to the compiler that this branch is rarely taken.
                   if (unlikely(x))
                        printk(KERN_ALERT "BUG: Bad rss-counter state "
                                          "mm:%p idx:%d val:%ld\n", mm, i, x);
        }
}

mm_struct (include/linux/mm_types.h)。

struct mm_rss_stat {
        atomic_long_t count[NR_MM_COUNTERS];
};
 
struct mm_struct {
        struct vm_area_struct * mmap;           /* list of VMAs */
        struct rb_root mm_rb;
        struct vm_area_struct * mmap_cache;     /* last find_vma result */
 
        ... 略 ...
 
        struct mm_rss_stat rss_stat;
 
        ... 略 ...
};

方法二

# cat /proc/mtrr
reg00: base=0x0e0000000 ( 3584MB), size=  512MB, count=1: uncachable
# ./mmap
malloc vaddr: 0x00000001009a2010 val: 3
pte: 0x0000000007ee5000
mmap:624 map pfn RAM range req uncached-minus for [mem 0x07ee5000-0x07ee5fff], got write-back
malloc vaddr: 0x00000001009a2010 val: 10
pte: 0x8000000007ee5267

reserve_pfn_range (arch/x86/mm/pat.c)。

static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
                                int strict_prot)
{
    ... 略 ...
 
        if (is_ram) {
                if (!pat_enabled)
                        return 0;
 
                flags = lookup_memtype(paddr);
                if (want_flags != flags) {
                        printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
                                current->comm, current->pid,
                                cattr_name(want_flags),
                                (unsigned long long)paddr,
                                (unsigned long long)(paddr + size - 1),
                                cattr_name(flags));
                        *vma_prot = __pgprot((pgprot_val(*vma_prot) &
                                              (~_PAGE_CACHE_MASK)) |
                                             flags);
                }
                return 0;
        }
 
    ... 略 ...
}

Documentation/x86/pat.txt
Documentation/x86/mtrr.txt
- Speeding up the graphics on Pentium Pro / Pentium II computers

方法三

直接進入內核修改頁表項。必須在程序結束時，將手動分配的頁釋放掉。

鎖

RCU supports concurrency between a single updater and multiple readers.

Read-copy-update (RCU)

追蹤點

arch/x86/kvm/trace.h 裡面列出欲生成的追蹤點。

/*
 * Tracepoint kvm_entry.
 *
 * Takes a single argument, the vcpu id, stores it in the trace entry and
 * formats it as "vcpu %u". Call sites invoke it as trace_kvm_entry(id).
 */
TRACE_EVENT(kvm_entry,
  /* Prototype and argument list of the tracepoint. */
  TP_PROTO(unsigned int vcpu_id),
  TP_ARGS(vcpu_id),
 
  /* Layout of the recorded entry: one unsigned int field. */
  TP_STRUCT__entry(
    __field(  unsigned int, vcpu_id   )
  ),
 
  /* Copy the argument into the recorded entry. */
  TP_fast_assign(
    __entry->vcpu_id  = vcpu_id;
  ),
 
  /* How the recorded entry is printed. */
  TP_printk("vcpu %u", __entry->vcpu_id)
);

Using the TRACE_EVENT() macro (Part 1)

在 vcpu_enter_guest (arch/x86/kvm/x86.c) 插入追蹤點。

static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
  ... 略 ...
 
  trace_kvm_entry(vcpu->vcpu_id);
  kvm_x86_ops->run(vcpu);
 
  ... 略 ...
}

驅動與模組

其它

Bottom Half

在處理中斷時，內核會禁用中斷以免影響當前正在運行的中斷處理函式。一般來說，會希望禁用中斷的時間盡可能的縮短，以加快系統的反應速度 (禁用中斷代表系統對外部中斷無法反應)。因此會將中斷處理函式分成 top half 和 bottom half，前者在禁用中斷的情況下執行，其執行時間短; 後者則是在開啟中斷的情況下運行。

Interrupts can come anytime, when the kernel may want to finish something else it was trying to do. The kernel's goal is therefore to get the interrupt out of the way as soon as possible and defer as much processing as it can. For instance, suppose a block of data has arrived on a network line. When the hardware interrupts the kernel, it could simply mark the presence of data, give the processor back to whatever was running before, and do the rest of the processing later (such as moving the data into a buffer where its recipient process can find it, and then restarting the process). The activities that the kernel needs to perform in response to an interrupt are thus divided into a critical urgent part that the kernel executes right away and a deferrable part that is left for later.

http://www.makelinux.net/books/ulk3/understandlk-CHP-4-SECT-1

模塊

MMIO

物理內存會有一個區段映射至裝置的暫存器和內存，這一物理內存同樣在頁表中有虛擬位址到物理位址的映射。如 Motherboard Chipsets and the Memory Map 一文所述，CPU 送出的物理位址是透過北橋來決定是存取內存還是裝置。

外部連結

¹⁾ http://ppc52776.blogspot.tw/2012/08/adding-new-system-call-in-linux-kernel.html

²⁾ http://people.cs.nctu.edu.tw/~chenwj/log/UNIX/nico103-2012-08-29.txt

³⁾ http://lwn.net/Articles/267427/

ELF 映像的装入

系統呼叫

添加系統呼叫

例外

進程

虛擬內存

術語

頁缺失

外部連結

進程內存用量

傾印物理內存

其它

方法一

方法二

方法三

鎖

追蹤點

驅動與模組

其它

Bottom Half

模塊

MMIO

外部連結

搜索

登录