0
点赞
收藏
分享

微信扫一扫

20150501 调试分析之 修改内核来定位系统僵死问题

孟祥忠诗歌 2022-03-30 阅读 105


20150501 调试分析之 修改内核来定位系统僵死问题

2015-05-01 Lover雪儿


今天还是研究内核调试,

死机,这个词语,大家应该不陌生.

当我们编写的程序(例如驱动)被加入到内核之后,如果其中出现了死循环,那么一旦运行到这段代码,系统就会直接进入类似死机的僵死状态.

那么怎么可以解决这个问题呢?


我们都知道,我们人的心脏是一直跳动的,而恰恰如此,内核也有它的跳动,那就是tick中断,

所以我们可以从tick中断入手,解决上面的死机问题.


在开发板上运行cat /proc/interrupts 可以查看系统当前的各种中断号,

可以看到一个中断名为i.MX Timer Tick 的中断,那么它就是我们今天的主角.

1 root@EasyARM-iMX257 /mnt/nfs/module# cat /proc/interrupts 
2 CPU0
3 9: 0 - mxsdhci
4 14: 0 - CSPI_IRQ
5 25: 2 - imxdi - mxcsdma
6 35: 0 - ehci_hcd:usb1
7 37: 2453 - mxcintuart
8 46: 3 - m - i.MX Timer Tick
9 57: 0 - mxsdhci
10 Err: 0



在内核中查找 Timer Tick的源代码,如下所示:

1 /* linux-2.6.31/arch/arm/plat-mxc/time.c
2 * IRQ handler for the timer
3 */
4 static irqreturn_t mxc_timer_interrupt(int irq, void *dev_id)
5 {
6 struct clock_event_device *evt = &clockevent_mxc;
7 uint32_t tstat;
8
9 if (timer_is_v2())
10 tstat = __raw_readl(timer_base + MX3_TSTAT);
11 else
12 tstat = __raw_readl(timer_base + MX1_2_TSTAT);
13
14 gpt_irq_acknowledge();
15
16 evt->event_handler(evt);
17
18 return IRQ_HANDLED;
19 }
20
21 static struct irqaction mxc_timer_irq = {
22 .name = "i.MX Timer Tick",
23 .flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL,
24 .handler = mxc_timer_interrupt,
25 };


在这个函数中,我们可以增加一些代码:有点类似看门狗



.mxc_timer_interrupt中增加打印语句


在mxc_timer_interrupt中断函数中检测系统当前正在运行的进程,如果10s之内都是同一个进程在运行,那我们就把这个进程打印出来(先从简单入手,此处先不做太多的复杂事情)

步骤:

①首先备份 linux-2.6.31/arch/arm/plat-mxc/time.c,

②接着修改time.c的内容,

③最后编译内核,重新给板子启动新内核



root@Lover雪:/home/study/nfs_home/system/linux-2.6.31/arch/arm/plat-mxc# cp time.c time.c.bak

修改time.c,在中断函数中加入打印语句

root@Lover雪:/home/study/nfs_home/system/linux-2.6.31/arch/arm/plat-mxc# vi time.c

************************************************************************************************

root@Lover雪:/home/study/nfs_home/system/linux-2.6.31/arch/arm/plat-mxc# cd ../../..

编译内核

root@Lover雪:/home/study/nfs_home/system/linux-2.6.31# make uImage

CHK include/linux/version.h

make[1]: 'include/asm-arm/mach-types.h' is up to date.

CHK include/linux/utsrelease.h

SYMLINK include/asm -> include/asm-arm

************************************************************************************************

Data Size: 2180620 Bytes = 2129.51 kB = 2.08 MB

Load Address: 80008000

Entry Point: 80008000

Image arch/arm/boot/uImage is ready

root@Lover雪:/home/study/nfs_home/system/linux-2.6.31# cp arch/arm/boot/uImage /tftpboot/uImage

root@Lover雪:/home/study/nfs_home/system/linux-2.6.31#

************************************************************************************************

在开发板上重新烧写内核

MX25 U-Boot > run upsystem

FEC: enable RMII gasket

TFTP from server 192.168.31.179; our IP address is 192.168.31.180

Filename '00

Loading: #################################################################

#################################################################

###################

done

************************************************************************************************

加载完毕后,如果不去操作开发板,会发现每隔10秒钟就会打印一次pid = 0,name = swapper的消息(swapper即空闲进程,系统空闲时一直是它在运行).

root@EasyARM-iMX257 ~# mxc_timer_interrupt: pid = 0, name = swapper

root@EasyARM-iMX257 ~# mxc_timer_interrupt: pid = 0, name = swapper

root@EasyARM-iMX257 ~#



修改time.c如下所示:

1 /*   linux-2.6.31/arch/arm/plat-mxc/time.c
2 * IRQ handler for the timer
3 */
4 static irqreturn_t mxc_timer_interrupt(int irq, void *dev_id)
5 {
6 struct clock_event_device *evt = &clockevent_mxc;
7 uint32_t tstat;
8
9 static pid_t pre_pid;
10 static int cnt = 0;
11 if(pre_pid == current->pid){
12 cnt++;
13 }else{
14 cnt = 0;
15 pre_pid = current->pid;
16 }
17 if(cnt == 10*HZ){
18 cnt = 0;
19 printk("mxc_timer_interrupt: pid = %d, name = %s\n",current->pid, current->comm);
20 }
21 //
22 if (timer_is_v2())
23 tstat = __raw_readl(timer_base + MX3_TSTAT);
24 else
25 tstat = __raw_readl(timer_base + MX1_2_TSTAT);
26
27 gpt_irq_acknowledge();
28
29 evt->event_handler(evt);
30
31 return IRQ_HANDLED;
32 }


.修改错误代码,在代码中增加死循环

还是沿用我们前面的err_led.c的驱动程序.

在open函数中,我们故意加入一个死循环.

/* err_led.c
*/
44 static int key_open(struct inode *inode, struct file *file)
45 {
46 printk("<0>function open!\n\n");
47 //在此加入一个死循环
48 while(1);
49 return 0;
50 }



编译驱动后,接着在开发板中加载这个带有错误的驱动程序,并使用cat命令打开设备.


root@EasyARM-iMX257 ~# ifconfig eth0 192.168.31.181;mount -t nfs 192.168.31.179:

/home/study/nfs_home /mnt/nfs -o nolock;cd /mnt/nfs/module/

root@EasyARM-iMX257 /mnt/nfs/module#

root@EasyARM-iMX257 /mnt/nfs/module# cd 39_debug_with_timer/

root@EasyARM-iMX257 /mnt/nfs/module/39_debug_with_timer# insmod err_led.ko


Hello,this is err_led_dev module!


addr base_iomux : c4a26000

addr base_gpio3 : c4a2a000

addr MUX_CTL : c4a26060

addr PAD_CTL : c4a26270

addr GDIR_GPIO3 : c4a2a004

addr DR_GPIO3 : c4a2a000

root@EasyARM-iMX257 /mnt/nfs/module/39_debug_with_timer# cat /dev/err_led_dev

function open!

#################################################################

可以发现,打开设备后,进入open函数,系统直接进入死机状态,每隔10s钟便会打印出陷入死循环的进程号pid=1805

mxc_timer_interrupt: pid = 1805, name = cat

mxc_timer_interrupt: pid = 1805, name = cat

mxc_timer_interrupt: pid = 1805, name = cat



.修改irq.c,在asm_do_IRQ中打印进程信息和PC值


接着恢复上面time.c的代码,然后在linux-2.6.31/arch/arm/kernel/irq.c文件中找到系统中断的总入口函数asm_do_IRQ,

我们在asm_do_IRQ函数里加入前面time.c中的打印代码.


root@Lover雪:/home/study/nfs_home/system/linux-2.6.31# cd arch/arm/plat-mxc/

root@Lover雪:/home/study/nfs_home/system/linux-2.6.31/arch/arm/plat-mxc# mv time.c.bak time.c

root@Lover雪:/home/study/nfs_home/system/linux-2.6.31/arch/arm/plat-mxc# cd ..

root@Lover雪:/home/study/nfs_home/system/linux-2.6.31/arch/arm# cd kernel/

root@Lover雪:/home/study/nfs_home/system/linux-2.6.31/arch/arm/kernel# vi irq.c

root@Lover雪:/home/study/nfs_home/system/linux-2.6.31/arch/arm/kernel# cd ../../../

root@Lover雪:/home/study/nfs_home/system/linux-2.6.31# make uImage

########################################################

Load Address: 80008000

Entry Point: 80008000

Image arch/arm/boot/uImage is ready

root@Lover雪:/home/study/nfs_home/system/linux-2.6.31# cp arch/arm/boot/uImage /tftpboot/uImage

root@Lover雪:/home/study/nfs_home/system/linux-2.6.31#


########################################################

从开发板重新烧写新内核

启动开发板



irq.c修改内容如下:

1 /* linux-2.6.31/arch/arm/kernel/irq.c
2 * do_IRQ handles all hardware IRQ's. Decoded IRQs should not
3 * come via this function. Instead, they should provide their
4 * own 'handler'
5 */
6 asmlinkage void __exception asm_do_IRQ(unsigned int irq, struct pt_regs *regs)
7 {
8 struct pt_regs *old_regs = set_irq_regs(regs);
9
10 //从 cat /proc/interrupts 中得到我们的tick中断为46
11 if(irq == 46)
12 {
13
14 static pid_t pre_pid;
15 static int cnt = 0;
16 if(pre_pid == current->pid){
17 cnt++;
18 }else{
19 cnt = 0;
20 pre_pid = current->pid;
21 }
22 if(cnt == 10*HZ){
23 cnt = 0;
24 printk("asm_do_IRQ => mxc_timer_interrupt: pid = %d, name = %s\n",current->pid, current->comm);
25 printk("pc = %08x\n",regs->ARM_pc);//ptract.h
26 }
27 //
28 }
29
30
31 irq_enter();
32
33 /*
34 * Some hardware gives randomly wrong interrupts. Rather
35 * than crashing, do something sensible.
36 */
37 if (unlikely(irq >= NR_IRQS)) {
38 if (printk_ratelimit())
39 printk(KERN_WARNING "Bad IRQ%u\n", irq);
40 ack_bad_irq(irq);
41 } else {
42 generic_handle_irq(irq);
43 }
44
45 /* AT91 specific workaround */
46 irq_finish(irq);
47
48 irq_exit();
49 set_irq_regs(old_regs);
50 }


启动开发板,加载错误的驱动程序,根据打印出来的PC值来反推错误地址:


root@EasyARM-iMX257 /mnt/nfs/module/39_debug_with_timer# insmod err_led.ko

root@EasyARM-iMX257 /mnt/nfs/module/39_debug_with_timer# cat /dev/err_led_dev

function open!


根据打印出来的pc值,再 cat /proc/kallsyms,找到该pc值所在的驱动模块err_led.ko,对其进行反汇编,然后找到出错的函数,进而反推出C语言代码的出错位置.


92 00000130 <key_open>:

93 130: e52de004 str lr, [sp, #-4]!

94 134: e59f0008 ldr r0, [pc, #8] ; 144 <.text+0x144>

95 138: e24dd004 sub sp, sp, #4 ; 0x4

96 13c: ebfffffe bl 0 <printk>

97 140: eafffffe b 140 <key_open+0x10> //很容易就找到了错误地址,此处一直b 140就为死循环

98 144: 000000cc andeq r0, r0, ip, asr #1

99




步骤和前面的博客文章

<20150430 调试分析之 根据内核报错信息PC指针分析错误>一样了.


如果要调试应用程序,可以使用strace,具体的用法,百度上有很详细的解释


附上驱动程序err_led.c

20150501 调试分析之 修改内核来定位系统僵死问题_linux20150501 调试分析之 修改内核来定位系统僵死问题_linux_02

1 #include<linux/cdev.h>
2 #include<linux/module.h>
3 #include<linux/types.h>
4 #include<linux/fs.h>
5 #include<linux/errno.h>
6 #include<linux/mm.h>
7 #include<linux/sched.h>
8 #include<linux/init.h>
9 #include<asm/io.h>
10 #include<asm/system.h>
11 #include<asm/uaccess.h>
12 #include<linux/device.h>
13 #include <linux/delay.h>
14
15 #define Driver_NAME "err_led_dev"
16 #define DEVICE_NAME "err_led_dev"
17
18 static int major = 0;
19
20 #define LED_ON 0
21 #define LED_OFF 1
22
23
24 //auto to create device node
25 static struct class *drv_class = NULL;
26 static struct class_device *drv_class_dev = NULL;
27
28 //寄存器基址;
29 static unsigned long mem_iomux;
30 static unsigned long mem_gpio3;
31 static unsigned long base_iomux; //iomux基址 0X 43FA C000 - 0X 43FA FFFF
32 static unsigned long base_gpio3; //gpio3 0X 53FA 4000 - 0X 53FA 7FFF
33 // MUX_CTL模式选择 配置寄存器
34 #define MUX_CTL (*(volatile unsigned long *)(base_iomux + 0x0060))
35 // PAD_CTL GPIO常用功能设置
36 #define PAD_CTL (*(volatile unsigned long *)(base_iomux + 0x0270))
37 // GPIO DR 数据寄存器 DR
38 #define DR_GPIO3 (*(volatile unsigned long *)(base_gpio3 + 0x0000))
39 // GPIO GDIR 方向控制寄存器 GDIR
40 #define GDIR_GPIO3 (*(volatile unsigned long *)(base_gpio3 + 0x0004))
41
42
43 static int key_open(struct inode *inode, struct file *file)
44 {
45 printk("<0>function open!\n\n");
46
47 //在此加入一个死循环
48 while(1);
49
50 return 0;
51 }
52
53 static int key_read(struct file *filp, char __user *buff, size_t count, loff_t *offp)
54 {
55 return 0;
56 }
57
58 static ssize_t key_write(struct file *file, const char __user *buf, size_t count, loff_t * ppos)
59 {
60 printk("<0>function write!\n\n");
61 return 1;
62 }
63
64 static int key_release(struct inode *inode, struct file *filp)
65 {
66 printk("<0>function write!\n\n");
67 return 0;
68 }
69
70 static int key_ioctl(struct inode *inode,struct file *flip,unsigned int command,unsigned long arg)
71 {
72 printk("<0>function ioctl!\n\n");
73
74 switch(command)
75 {
76 case LED_ON:
77 DR_GPIO3 &= ~(0x01 << 23); //将GPIO2_23清零 亮
78 break;
79 case LED_OFF:
80
81 DR_GPIO3 |= (0x01 << 23); //将GPIO2_23置1 灭
82 break;
83 default:
84 break;
85 }
86
87 return 0;
88 }
89 static struct file_operations key_fops = {
90 .owner = THIS_MODULE, /* 这是一个宏,推向编译模块时自动创建的__this_module变量 */
91 .open = key_open,
92 .read = key_read,
93 .write = key_write,
94 .release= key_release,
95 .ioctl = key_ioctl,
96 };
97
98 void gpio_addr(void){
99 printk("<0>addr base_iomux : %x \n",base_iomux);
100 printk("<0>addr base_gpio3 : %x \n",base_gpio3);
101 printk("<0>addr MUX_CTL : %x \n",&MUX_CTL);
102 printk("<0>addr PAD_CTL : %x \n",&PAD_CTL);
103 printk("<0>addr GDIR_GPIO3 : %x \n",&GDIR_GPIO3);
104 printk("<0>addr DR_GPIO3 : %x \n",&DR_GPIO3);
105 }
106
107
108
109 void led_on_off(void){
110 ssleep(1);
111 DR_GPIO3 |= (0x01 << 23); //将GPIO2_23置1
112 ssleep(1);
113 DR_GPIO3 &= ~(0x01 << 23); //将GPIO2_23清零
114 ssleep(1);
115 DR_GPIO3 |= (0x01 << 23); //将GPIO2_23置1
116 ssleep(1);
117 DR_GPIO3 &= ~(0x01 << 23); //将GPIO2_23清零
118 ssleep(1);
119 DR_GPIO3 |= (0x01 << 23); //将GPIO2_23置1
120 ssleep(1);
121 DR_GPIO3 &= ~(0x01 << 23); //将GPIO2_23清零
122 ssleep(1);
123 DR_GPIO3 |= (0x01 << 23); //将GPIO2_23置1
124 ssleep(1);
125 DR_GPIO3 &= ~(0x01 << 23); //将GPIO2_23清零
126 ssleep(1);
127 DR_GPIO3 |= (0x01 << 23); //将GPIO2_23置1
128 }
129
130 static int __init key_irq_init(void)
131 {
132 printk("<0>\nHello,this is %s module!\n\n",Driver_NAME);
133 //register and mknod
134 major = register_chrdev(0,Driver_NAME,&key_fops);
135 drv_class = class_create(THIS_MODULE,Driver_NAME);
136 drv_class_dev = device_create(drv_class,NULL,MKDEV(major,0),NULL,DEVICE_NAME); /*/dev/key_query*/
137
138 //IO端口申请 ioremap 可以直接通过指针来访问这些地址
139 base_iomux = ioremap(0x43FAC000,0xFFF);
140 base_gpio3 = ioremap(0x53FA4000,0xFFF);
141
142 //MUX_CTL
143 MUX_CTL &= ~(0x07 << 0);
144 MUX_CTL |= (0X05 << 0); //设置为ALT5 GPIO3_23 ERR_LED
145 //PAD_CTL
146 PAD_CTL &= ~(0x01<<13 | 0x01<<3 | 0x03<<1 | 0x01<<0); //1.8v 不需要上拉下拉 CMOS输出 slew rate
147 //GDIR_GPIO3 配置为输出模式
148 GDIR_GPIO3 &= ~(0x01 << 23);
149 GDIR_GPIO3 |= (0x01 << 23); //配置为输出模式
150
151 //DR_GPIO3 配置为输出0 点亮ERR_LED
152 DR_GPIO3 &= ~(0x01 << 23); //将GPIO2_23清零
153 DR_GPIO3 &= ~(0x01 << 23); //将GPIO2_23清零
154 gpio_addr();
155 led_on_off();
156 return 0;
157 }
158
159 static void __exit key_irq_exit(void)
160 {
161 gpio_addr();
162 printk("<0>\nGoodbye,%s!\n\n",Driver_NAME);
163 led_on_off();
164
165 unregister_chrdev(major,Driver_NAME);
166 device_unregister(drv_class_dev);
167 class_destroy(drv_class);
168
169 //释放IO端口
170 iounmap(base_iomux);
171 iounmap(base_gpio3);
172 }
173
174
175 /* 这两行指定驱动程序的初始化函数和卸载函数 */
176 module_init(key_irq_init);
177 module_exit(key_irq_exit);
178
179 /* 描述驱动程序的一些信息,不是必须的 */
180 MODULE_AUTHOR("Lover雪儿");
181 MODULE_VERSION("0.1.0");
182 MODULE_DESCRIPTION("IMX257 key Driver");
183 MODULE_LICENSE("GPL");

err_led.ko



举报

相关推荐

0 条评论