[转]Linux内核分了哪些段
MMU除了管理虚拟地址到物理地址的映射外,还管理内存的访问权限。Linux内核分了很多段,如代码段、只读数据段、数据段等,不同的段访问权限肯定是不同的。代码段一般来说可读可执行,并且只能在特权模式下执行,但是不可写;只读数据段只能读,不能写也不能执行;数据段可以读写,不能执行。
我们首先要知道Linux分了哪些段。以ARM64为例,我们看下编译生成的链接脚本vmlinux.lds文件。
. = ((((0xffffffffffffffff << (48)) + (0)) + (0x08000000))) + 0x00080000; .head.text : { _text = .; *(.head.text) } .text : { /* Real text segment */ _stext = .; /* Text and read-only data */ __exception_text_start = .; *(.exception.text) __exception_text_end = .; . = ALIGN(8); __entry_text_start = .; *(.entry.text) __entry_text_end = .; . = ALIGN(8); *(.text.hot .text .text.fixup .text.unlikely) *(.ref.text) . = ALIGN(8); __sched_text_start = .; *(.sched.text) __sched_text_end = .; . = ALIGN(8); __cpuidle_text_start = .; *(.cpuidle.text) __cpuidle_text_end = .; . = ALIGN(8); __lock_text_start = .; *(.spinlock.text) __lock_text_end = .; . = ALIGN(8); __kprobes_text_start = .; *(.kprobes.text) __kprobes_text_end = .; . = ALIGN(0x00001000); __hyp_idmap_text_start = .; *(.hyp.idmap.text) __hyp_idmap_text_end = .; __hyp_text_start = .; *(.hyp.text) __hyp_text_end = .; . = ALIGN(0x00001000); __idmap_text_start = .; *(.idmap.text) __idmap_text_end = .; *(.fixup) *(.gnu.warning) . = ALIGN(16); *(.got) /* Global offset table */ } . = ALIGN(0x00010000); _etext = .; /* End of text section */ . = ALIGN(((1 << 12))); .rodata : AT(ADDR(.rodata) - 0) { __start_rodata = .; *(.rodata) *(.rodata.*) __start_data_ro_after_init = .; *(.data..ro_after_init) __end_data_ro_after_init = .; *(__vermagic) . = ALIGN(8); __start___tracepoints_ptrs = .; *(__tracepoints_ptrs) __stop___tracepoints_ptrs = .; *(__tracepoints_strings) } .rodata1 : AT(ADDR(.rodata1) - 0) { *(.rodata1) } . 
= ALIGN(8); __bug_table : AT(ADDR(__bug_table) - 0) { __start___bug_table = .; *(__bug_table) __stop___bug_table = .; } .pci_fixup : AT(ADDR(.pci_fixup) - 0) { __start_pci_fixups_early = .; *(.pci_fixup_early) __end_pci_fixups_early = .; __start_pci_fixups_header = .; *(.pci_fixup_header) __end_pci_fixups_header = .; __start_pci_fixups_final = .; *(.pci_fixup_final) __end_pci_fixups_final = .; __start_pci_fixups_enable = .; *(.pci_fixup_enable) __end_pci_fixups_enable = .; __start_pci_fixups_resume = .; *(.pci_fixup_resume) __end_pci_fixups_resume = .; __start_pci_fixups_resume_early = .; *(.pci_fixup_resume_early) __end_pci_fixups_resume_early = .; __start_pci_fixups_suspend = .; *(.pci_fixup_suspend) __end_pci_fixups_suspend = .; __start_pci_fixups_suspend_late = .; *(.pci_fixup_suspend_late) __end_pci_fixups_suspend_late = .; } .builtin_fw : AT(ADDR(.builtin_fw) - 0) { __start_builtin_fw = .; *(.builtin_fw) __end_builtin_fw = .; } __ksymtab : AT(ADDR(__ksymtab) - 0) { __start___ksymtab = .; KEEP(*(SORT(___ksymtab+*))) __stop___ksymtab = .; } __ksymtab_gpl : AT(ADDR(__ksymtab_gpl) - 0) { __start___ksymtab_gpl = .; KEEP(*(SORT(___ksymtab_gpl+*))) __stop___ksymtab_gpl = .; } __ksymtab_unused : AT(ADDR(__ksymtab_unused) - 0) { __start___ksymtab_unused = .; KEEP(*(SORT(___ksymtab_unused+*))) __stop___ksymtab_unused = .; } __ksymtab_unused_gpl : AT(ADDR(__ksymtab_unused_gpl) - 0) { __start___ksymtab_unused_gpl = .; KEEP(*(SORT(___ksymtab_unused_gpl+*))) __stop___ksymtab_unused_gpl = .; } __ksymtab_gpl_future : AT(ADDR(__ksymtab_gpl_future) - 0) { __start___ksymtab_gpl_future = .; KEEP(*(SORT(___ksymtab_gpl_future+*))) __stop___ksymtab_gpl_future = .; } __kcrctab : AT(ADDR(__kcrctab) - 0) { __start___kcrctab = .; KEEP(*(SORT(___kcrctab+*))) __stop___kcrctab = .; } __kcrctab_gpl : AT(ADDR(__kcrctab_gpl) - 0) { __start___kcrctab_gpl = .; KEEP(*(SORT(___kcrctab_gpl+*))) __stop___kcrctab_gpl = .; } __kcrctab_unused : AT(ADDR(__kcrctab_unused) - 0) { 
__start___kcrctab_unused = .; KEEP(*(SORT(___kcrctab_unused+*))) __stop___kcrctab_unused = .; } __kcrctab_unused_gpl : AT(ADDR(__kcrctab_unused_gpl) - 0) { __start___kcrctab_unused_gpl = .; KEEP(*(SORT(___kcrctab_unused_gpl+*))) __stop___kcrctab_unused_gpl = .; } __kcrctab_gpl_future : AT(ADDR(__kcrctab_gpl_future) - 0) { __start___kcrctab_gpl_future = .; KEEP(*(SORT(___kcrctab_gpl_future+*))) __stop___kcrctab_gpl_future = .; } __ksymtab_strings : AT(ADDR(__ksymtab_strings) - 0) { KEEP(*(__ksymtab_strings)) } __init_rodata : AT(ADDR(__init_rodata) - 0) { *(.ref.rodata) } __param : AT(ADDR(__param) - 0) { __start___param = .; *(__param) __stop___param = .; } __modver : AT(ADDR(__modver) - 0) { __start___modver = .; *(__modver) __stop___modver = .; . = ALIGN(((1 << 12))); __end_rodata = .; } . = ALIGN(((1 << 12))); /* everything from this point to */ . = ALIGN(8); __ex_table : AT(ADDR(__ex_table) - 0) { __start___ex_table = .; *(__ex_table) __stop___ex_table = .; } /* __init_begin will be marked RO NX */ .notes : AT(ADDR(.notes) - 0) { __start_notes = .; *(.note.*) __stop_notes = .; } . = ALIGN(0x00010000); __init_begin = .; . = ALIGN(8); .init.text : AT(ADDR(.init.text) - 0) { _sinittext = .; *(.init.text) *(.text.startup) *(.meminit.text) _einittext = .; } .exit.text : { *(.exit.text) *(.text.exit) *(.memexit.text) } .init.data : { KEEP(*(SORT(___kentry+*))) *(.init.data) *(.meminit.data) *(.init.rodata) *(.meminit.rodata) . = ALIGN(8); __clk_of_table = .; *(__clk_of_table) *(__clk_of_table_end) . = ALIGN(8); __reservedmem_of_table = .; *(__reservedmem_of_table) *(__reservedmem_of_table_end) . = ALIGN(8); __clksrc_of_table = .; *(__clksrc_of_table) *(__clksrc_of_table_end) . = ALIGN(8); __iommu_of_table = .; *(__iommu_of_table) *(__iommu_of_table_end) . = ALIGN(8); __cpu_method_of_table = .; *(__cpu_method_of_table) *(__cpu_method_of_table_end) . = ALIGN(8); __cpuidle_method_of_table = .; *(__cpuidle_method_of_table) *(__cpuidle_method_of_table_end) . 
= ALIGN(32); __dtb_start = .; *(.dtb.init.rodata) __dtb_end = .; . = ALIGN(8); __irqchip_of_table = .; *(__irqchip_of_table) *(__irqchip_of_table_end) . = ALIGN(8); __irqchip_acpi_probe_table = .; *(__irqchip_acpi_probe_table) __irqchip_acpi_probe_table_end = .; . = ALIGN(8); __clksrc_acpi_probe_table = .; *(__clksrc_acpi_probe_table) __clksrc_acpi_probe_table_end = .; . = ALIGN(32); __earlycon_table = .; *(__earlycon_table) __earlycon_table_end = .; . = ALIGN(16); __setup_start = .; *(.init.setup) __setup_end = .; __initcall_start = .; KEEP(*(.initcallearly.init)) __initcall0_start = .; KEEP(*(.initcall0.init)) KEEP(*(.initcall0s.init)) __initcall1_start = .; KEEP(*(.initcall1.init)) KEEP(*(.initcall1s.init)) __initcall2_start = .; KEEP(*(.initcall2.init)) KEEP(*(.initcall2s.init)) __initcall3_start = .; KEEP(*(.initcall3.init)) KEEP(*(.initcall3s.init)) __initcall4_start = .; KEEP(*(.initcall4.init)) KEEP(*(.initcall4s.init)) __initcall5_start = .; KEEP(*(.initcall5.init)) KEEP(*(.initcall5s.init)) __initcallrootfs_start = .; KEEP(*(.initcallrootfs.init)) KEEP(*(.initcallrootfss.init)) __initcall6_start = .; KEEP(*(.initcall6.init)) KEEP(*(.initcall6s.init)) __initcall7_start = .; KEEP(*(.initcall7.init)) KEEP(*(.initcall7s.init)) __initcall_end = .; __con_initcall_start = .; KEEP(*(.con_initcall.init)) __con_initcall_end = .; __security_initcall_start = .; KEEP(*(.security_initcall.init)) __security_initcall_end = .; . = ALIGN(4); __initramfs_start = .; KEEP(*(.init.ramfs)) . = ALIGN(8); KEEP(*(.init.ramfs.info)) *(.init.rodata.* .init.bss) /* from the EFI stub */ } .exit.data : { *(.exit.data) *(.fini_array) *(.dtors) *(.memexit.data) *(.memexit.rodata) } . = ALIGN((1 << 12)); .data..percpu : AT(ADDR(.data..percpu) - 0) { __per_cpu_load = .; __per_cpu_start = .; *(.data..percpu..first) . = ALIGN((1 << 12)); *(.data..percpu..page_aligned) . = ALIGN((1 << 7)); *(.data..percpu..read_mostly) . 
= ALIGN((1 << 7)); *(.data..percpu) *(.data..percpu..shared_aligned) __per_cpu_end = .; } . = ALIGN(4); .altinstructions : { __alt_instructions = .; *(.altinstructions) __alt_instructions_end = .; } .altinstr_replacement : { *(.altinstr_replacement) } .rela : ALIGN(8) { *(.rela .rela*) } __rela_offset = ABSOLUTE(ADDR(.rela) - ((((0xffffffffffffffff << (48)) + (0)) + (0x08000000)))); __rela_size = SIZEOF(.rela); . = ALIGN(0x00010000); __init_end = .; _data = .; _sdata = .; . = ALIGN((1 << 12)); .data : AT(ADDR(.data) - 0) { . = ALIGN(16384); __start_init_task = .; *(.data..init_task) __end_init_task = .; . = ALIGN((1 << 12)); __nosave_begin = .; *(.data..nosave) . = ALIGN((1 << 12)); __nosave_end = .; . = ALIGN((1 << 12)); *(.data..page_aligned) . = ALIGN((1 << 7)); *(.data..cacheline_aligned) . = ALIGN((1 << 7)); *(.data..read_mostly) . = ALIGN((1 << 7)); *(.data .data.[0-9a-zA-Z_]*) *(.ref.data) *(.data..shared_aligned) *(.data.unlikely) . = ALIGN(32); *(__tracepoints) . = ALIGN(8); __start___jump_table = .; *(__jump_table) __stop___jump_table = .; . = ALIGN(8); __start___verbose = .; *(__verbose) __stop___verbose = .; CONSTRUCTORS } /* * Data written with the MMU off but read with the MMU on requires * cache lines to be invalidated, discarding up to a Cache Writeback * Granule (CWG) of data from the cache. Keep the section that * requires this type of maintenance to be in its own Cache Writeback * Granule (CWG) area so the cache maintenance operations don't * interfere with adjacent data. */ .mmuoff.data.write : ALIGN(0x00000800) { __mmuoff_data_start = .; *(.mmuoff.data.write) } . = ALIGN(0x00000800); .mmuoff.data.read : { *(.mmuoff.data.read) __mmuoff_data_end = .; } .pecoff_edata_padding : { BYTE(0); . = ALIGN(PECOFF_FILE_ALIGNMENT); } _edata = .; . = ALIGN(0); __bss_start = .; . = ALIGN(0); .sbss : AT(ADDR(.sbss) - 0) { *(.sbss) *(.scommon) } . = ALIGN(0); .bss : AT(ADDR(.bss) - 0) { *(.bss..page_aligned) *(.dynbss) *(.bss .bss.[0-9a-zA-Z_]*) *(COMMON) } . 
= ALIGN(0); __bss_stop = .; . = ALIGN((1 << 12)); idmap_pg_dir = .; . += ((((((48)) - 4) / (12 - 3)) - 1) * (1 << 12)); swapper_pg_dir = .; . += ((4 - 1) * (1 << 12)); _end = .; .stab 0 : { *(.stab) } .stabstr 0 : { *(.stabstr) } .stab.excl 0 : { *(.stab.excl) } .stab.exclstr 0 : { *(.stab.exclstr) } .stab.index 0 : { *(.stab.index) } .stab.indexstr 0 : { *(.stab.indexstr) } .comment 0 : { *(.comment) } _kernel_size_le_lo32 = (((_end - _text) & 0xffffffff) & 0xffffffff); _kernel_size_le_hi32 = (((_end - _text) >> 32) & 0xffffffff); _kernel_offset_le_lo32 = (((0x00080000) & 0xffffffff) & 0xffffffff); _kernel_offset_le_hi32 = (((0x00080000) >> 32) & 0xffffffff); _kernel_flags_le_lo32 = (((((0 << 0) | (((12 - 10) / 2) << 1) | (1 << 3))) & 0xffffffff) & 0xffffffff); _kernel_flags_le_hi32 = (((((0 << 0) | (((12 - 10) / 2) << 1) | (1 << 3))) >> 32) & 0xffffffff); } /* * The HYP init code and ID map text can't be longer than a page each, * and should not cross a page boundary. */ ASSERT(__hyp_idmap_text_end - (__hyp_idmap_text_start & ~(0x00001000 - 1)) <= 0x00001000, "HYP init code too big or misaligned") ASSERT(__idmap_text_end - (__idmap_text_start & ~(0x00001000 - 1)) <= 0x00001000, "ID map text too big or misaligned") /* * If padding is applied before .head.text, virt<->phys conversions will fail. */ ASSERT(_text == (((((0xffffffffffffffff << (48)) + (0)) + (0x08000000))) + 0x00080000), "HEAD is misaligned")
- _text到_etext之间是代码段
- __start_rodata到__init_begin之间是只读数据段
- __init_begin到__init_end之间是内核初始化相关的段,包括代码和数据,这个段的特点是只在内核初始化的时候用,内核初始化完成后就可以释放了。
- _data到_end之间是可读可写的数据段
各个段的权限设定
以ARM64为例,内核各个段权限的设置在arch/arm64/mm/mmu.c的map_kernel中定义。
/*
 * Create the kernel image mappings in the page table rooted at @pgd.
 *
 * The image is mapped as four regions delimited by linker-script symbols,
 * each through map_kernel_segment() with its own protection value:
 *   _text          .. _etext        PAGE_KERNEL_EXEC  (text)
 *   __start_rodata .. __init_begin  PAGE_KERNEL       (rodata)
 *   __init_begin   .. __init_end    PAGE_KERNEL_EXEC  (init code and data)
 *   _data          .. _end          PAGE_KERNEL       (data)
 * After that the fixmap is hooked into @pgd and kasan_copy_shadow() is
 * called on @pgd.
 *
 * NOTE(review): neither protection value used here includes PTE_RDONLY
 * (see the PAGE_KERNEL / PAGE_KERNEL_EXEC definitions), so any read-only
 * enforcement for text/rodata must happen elsewhere.
 */
static void __init map_kernel(pgd_t *pgd)
{
	/*
	 * One vm_struct per region. They are static, so they outlive this
	 * function; presumably map_kernel_segment() keeps a reference to
	 * them -- confirm against its definition.
	 */
	static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_init, vmlinux_data;

	/* Text: executable protection value. */
	map_kernel_segment(pgd, _text, _etext, PAGE_KERNEL_EXEC, &vmlinux_text);
	/* Read-only data: mapped with the same protection value as .data here. */
	map_kernel_segment(pgd, __start_rodata, __init_begin, PAGE_KERNEL, &vmlinux_rodata);
	/* Init region holds both code and data, hence the executable value. */
	map_kernel_segment(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC, &vmlinux_init);
	/* Data through _end. */
	map_kernel_segment(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data);

	/* Is the pgd entry covering FIXADDR_START still empty in @pgd? */
	if (!pgd_val(*pgd_offset_raw(pgd, FIXADDR_START))) {
		/*
		 * The fixmap falls in a separate pgd to the kernel, and doesn't
		 * live in the carveout for the swapper_pg_dir. We can simply
		 * re-use the existing dir for the fixmap.
		 */
		set_pgd(pgd_offset_raw(pgd, FIXADDR_START),
			*pgd_offset_k(FIXADDR_START));
	} else if (CONFIG_PGTABLE_LEVELS > 3) {
		/*
		 * The fixmap shares its top level pgd entry with the kernel
		 * mapping. This can really only occur when we are running
		 * with 16k/4 levels, so we can simply reuse the pud level
		 * entry instead.
		 */
		BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
		/* Point the pud entry at bm_pmd, then drop the temporary fixmap slot. */
		set_pud(pud_set_fixmap_offset(pgd, FIXADDR_START),
			__pud(__pa(bm_pmd) | PUD_TYPE_TABLE));
		pud_clear_fixmap();
	} else {
		/* Entry already populated and fewer than 4 levels: not expected. */
		BUG();
	}

	kasan_copy_shadow(pgd);
}
- 代码段权限设定
/*
 * Protection value map_kernel() passes for the text segment: PTE_UXN is set
 * (not executable in unprivileged mode); PTE_PXN and PTE_RDONLY are not.
 */
#define PAGE_KERNEL_EXEC __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE)
PTE_UXN:指明非特权模式下不可执行,没有指明PTE_RDONLY,所以代码段是可以写的,这样岂不是很危险?
- 只读数据段权限设定
/*
 * Protection value map_kernel() passes for the read-only data segment:
 * PTE_PXN and PTE_UXN are both set (not executable in privileged or
 * unprivileged mode); PTE_RDONLY is not part of it.
 */
#define PAGE_KERNEL __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE)
PTE_PXN指明特权模式下不可执行,PTE_UXN指明非特权模式下不可执行,但是。。。没有指明PTE_RDONLY,只读数据段也可以写?什么鬼?
- 初始化段权限设定
/*
 * Same macro as used for the text segment; map_kernel() also passes it for
 * __init_begin .. __init_end. PTE_UXN is set; PTE_PXN and PTE_RDONLY are not.
 */
#define PAGE_KERNEL_EXEC __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE)
初始化段同时包括数据和代码,这里设置为跟代码段权限一致,但是依然是可写的,不过初始化完成后这个段会被释放,我们姑且认为安全隐患比较小。
- 数据段权限设定
/*
 * Same macro as used for the read-only data segment; map_kernel() also
 * passes it for _data .. _end. PTE_PXN and PTE_UXN are set; PTE_RDONLY is not.
 */
#define PAGE_KERNEL __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE)
可读写,不可执行,这个看起来比较正常。
看到这里可以肯定的是,事情没有那么简单,权限设定绝不可能仅仅在此处。代码段必须是只读的,否则存在很大的安全漏洞:如果内核代码被恶意程序修改了,那么CPU将很容易被黑客控制。所以代码段应该被设置为PAGE_KERNEL_ROX属性(只读,可在特权模式下执行),而只读数据段应该被设置为PAGE_KERNEL_RO(只读)。全局搜索一下这两个宏,果然发现在mark_readonly函数中,会将代码段和只读数据段的权限分别设置为PAGE_KERNEL_ROX和PAGE_KERNEL_RO。
那么为什么不一开始就直接设置为只读呢?这里其实是内核给用户提供了一个便利:如果启动内核时通过bootargs设置rodata=off(原文写作rodata=false;内核参数文档中给出的取值是on/off),则不会将只读数据段和代码段设置为只读,这应该是为了方便调试,或者用于一些特殊的场景;如果没有设置,则默认将代码段和只读数据段设置为只读。
原文链接:https://blog.csdn.net/liuhangtiant/article/details/98475799