[OpenAFS-devel] SIGSEGV in host.c, 1.4.1b, twice in past two days
Kim Kimball
kim.kimball@jpl.nasa.gov
Tue, 13 Jun 2006 13:58:25 -0600
One of our recently installed 1.4.1b fileservers dumped core in host.c
yesterday (6/12/2006) and again today, about half an hour ago.
Both cores point to h_gethostcps_r. I must be missing something, because
it looks like the call to h_gethostcps_r is correct, but the pointer to
the host structure is magically (since I can't figure out where it
happens :) null when the function starts to execute.
Kim Kimball
dhk at ccre.com
bash-2.05$ dbx fileserver core_jplis-fil-afs32_fileserver_0_0_1150115729_2944
For information about new features see `help changes'
To remove this message, put `dbxenv suppress_startup_message 7.4' in your .dbxrc
Reading fileserver
core file header read successfully
Reading ld.so.1
Reading libpthread.so.1
Reading libsocket.so.1
Reading libresolv.so.2
Reading libnsl.so.1
Reading libintl.so.1
Reading libdl.so.1
Reading libc.so.1
Reading libmp.so.2
Reading libc_psr.so.1
Reading libthread.so.1
Reading nss_files.so.1
WARNING!!
A loadobject was found with an unexpected checksum value.
See `help core mismatch' for details, and run `proc -map'
to see what checksum values were expected and found.
dbx: warning: Some symbolic information might be incorrect.
t@15 (l@15) terminated by signal SEGV (no mapping at the fault address)
Current function is h_gethostcps_r
490 host->hcpsfailed = 0;
(dbx) where
current thread: t@15
=>[1] h_gethostcps_r(host = (nil), now = 1150115729), line 490 in "host.c"
[2] h_Lookup_r(haddr = 2303680610U, hport = 7001U, heldp = 0xfe2fba54), line 638 in "host.c"
[3] h_GetHost_r(tcon = 0x11f3688), line 1118 in "host.c"
[4] h_FindClient_r(tcon = 0x11f3688), line 1676 in "host.c"
[5] CallPreamble(acall = 0x11a2340, activecall = 0, tconn = 0xfe2fbd34, ahostp = 0xfe2fbd30), line 317 in "afsfileprocs.c"
[6] SRXAFS_GetTime(acall = 0x11a2340, Seconds = 0xfe2fbdb4, USeconds = 0xfe2fbdb0), line 6793 in "afsfileprocs.c"
[7] _RXAFS_GetTime(z_call = 0x11a2340, z_xdrs = 0xfe2fbe38), line 1109 in "afsint.ss.c"
[8] RXAFS_ExecuteRequest(z_call = 0x11a2340), line 1941 in "afsint.ss.c"
[9] rxi_ServerProc(threadID = 98, newcall = (nil), socketp = 0xfe2fbf34), line 1407 in "rx.c"
[10] rx_ServerProc(), line 300 in "rx_pthread.c"
[11] server_entry(argp = 0xc84a0), line 98 in "rx_pthread.c"
(dbx) dump
slept = 0
code = 0
host = (nil)
now = 1150115729
(dbx) list
490 host->hcpsfailed = 0;
491
492 host->hostFlags &= ~HCPS_INPROGRESS;
493 /* signal all who are waiting */
494 if (host->hostFlags & HCPS_WAITING) { /* somebody is waiting */
495 host->hostFlags &= ~HCPS_WAITING;
496 #ifdef AFS_PTHREAD_ENV
497 assert(pthread_cond_broadcast(&host->cond) == 0);
498 #else /* AFS_PTHREAD_ENV */
499 if ((code = LWP_NoYieldSignal(&(host->hostFlags))) != LWP_SUCCESS)
(dbx) print host
host = (nil)
(dbx) print *host
dbx: reference through nil pointer
(dbx) # Assigned to null pointer to structure
(dbx) up
Current function is h_Lookup_r
638 h_gethostcps_r(host, now);
(dbx) dump
index = 98
haddr = 2303680610U
hport = 7001U
heldp = 0xfe2fba54
host = 0x113e628
now = 1150115729
chain = 0x11b37b0
(dbx) list
638 h_gethostcps_r(host, now);
639 }
640 break;
641 }
642 host = NULL;
643 }
644 return host;
645
646 } /*h_Lookup */
647
(dbx) where
current thread: t@15
[1] h_gethostcps_r(host = (nil), now = 1150115729), line 490 in "host.c"
=>[2] h_Lookup_r(haddr = 2303680610U, hport = 7001U, heldp = 0xfe2fba54), line 638 in "host.c"
[3] h_GetHost_r(tcon = 0x11f3688), line 1118 in "host.c"
[4] h_FindClient_r(tcon = 0x11f3688), line 1676 in "host.c"
[5] CallPreamble(acall = 0x11a2340, activecall = 0, tconn = 0xfe2fbd34, ahostp = 0xfe2fbd30), line 317 in "afsfileprocs.c"
[6] SRXAFS_GetTime(acall = 0x11a2340, Seconds = 0xfe2fbdb4, USeconds = 0xfe2fbdb0), line 6793 in "afsfileprocs.c"
[7] _RXAFS_GetTime(z_call = 0x11a2340, z_xdrs = 0xfe2fbe38), line 1109 in "afsint.ss.c"
[8] RXAFS_ExecuteRequest(z_call = 0x11a2340), line 1941 in "afsint.ss.c"
[9] rxi_ServerProc(threadID = 98, newcall = (nil), socketp = 0xfe2fbf34), line 1407 in "rx.c"
[10] rx_ServerProc(), line 300 in "rx_pthread.c"
[11] server_entry(argp = 0xc84a0), line 98 in "rx_pthread.c"
(dbx) up
Current function is h_GetHost_r
1118 host = h_Lookup_r(haddr, hport, &held);
(dbx) dump
interfValid = 0
hoststr =3D "=BC=FE/=B9\030"
haddr = 2303680610U
interf = RECORD
hoststr2 =3D "LS=FE/=B8=B8=FF\031\037=E4=FE/=B9=D4=FE/"
oldHost = 0x1
identP = (nil)
caps = RECORD
hport = 7001U
cb_conn = (nil)
code = 0
host = (nil)
oheld = 98
held = 0
tcon = 0x11f3688
(dbx) list
1118 host = h_Lookup_r(haddr, hport, &held);
1119 identP = (struct Identity *)rx_GetSpecific(tcon, rxcon_ident_key);
1120 if (host && !identP && !(host->Console & 1)) {
1121 /* This is a new connection, and we already have a host
1122 * structure for this address. Verify that the identity
1123 * of the caller matches the identity in the host structure.
1124 */
1125 h_Lock_r(host);
1126 if (!(host->hostFlags & ALTADDR)) {
1127 /* Another thread is doing initialization */
(dbx) where
current thread: t@15
[1] h_gethostcps_r(host = (nil), now = 1150115729), line 490 in "host.c"
[2] h_Lookup_r(haddr = 2303680610U, hport = 7001U, heldp = 0xfe2fba54), line 638 in "host.c"
=>[3] h_GetHost_r(tcon = 0x11f3688), line 1118 in "host.c"
[4] h_FindClient_r(tcon = 0x11f3688), line 1676 in "host.c"
[5] CallPreamble(acall = 0x11a2340, activecall = 0, tconn = 0xfe2fbd34, ahostp = 0xfe2fbd30), line 317 in "afsfileprocs.c"
[6] SRXAFS_GetTime(acall = 0x11a2340, Seconds = 0xfe2fbdb4, USeconds = 0xfe2fbdb0), line 6793 in "afsfileprocs.c"
[7] _RXAFS_GetTime(z_call = 0x11a2340, z_xdrs = 0xfe2fbe38), line 1109 in "afsint.ss.c"
[8] RXAFS_ExecuteRequest(z_call = 0x11a2340), line 1941 in "afsint.ss.c"
[9] rxi_ServerProc(threadID = 98, newcall = (nil), socketp = 0xfe2fbf34), line 1407 in "rx.c"
[10] rx_ServerProc(), line 300 in "rx_pthread.c"
[11] server_entry(argp = 0xc84a0), line 98 in "rx_pthread.c"
(dbx) up
Current function is h_FindClient_r
1676 host = h_GetHost_r(tcon); /* Returns it h_Held */
(dbx) dump
tinst = ""
client = (nil)
expTime = 2147483647
created = 0
code = 1
host = 0x543878
fail = 0
tname = ""
tcell = ""
authClass = 0
tcon = 0x11f3688
viceid = 32766
oldClient = (nil)
uname = ""
(dbx) list
1676 host = h_GetHost_r(tcon); /* Returns it h_Held */
1677
1678 retryfirstclient:
1679 /* First try to find the client structure */
1680 for (client = host->FirstClient; client; client = client->next) {
1681 if (!client->deleted && (client->sid == rxr_CidOf(tcon))
1682 && (client->VenusEpoch == rxr_GetEpoch(tcon))) {
1683 if (client->tcon && (client->tcon != tcon)) {
1684 ViceLog(0,
1685 ("*** Vid=%d, sid=%x, tcon=%x, Tcon=%x ***\n",
(dbx) up
Current function is CallPreamble
317 tclient = h_FindClient_r(*tconn);
(dbx) where
current thread: t@15
[1] h_gethostcps_r(host = (nil), now = 1150115729), line 490 in "host.c"
[2] h_Lookup_r(haddr = 2303680610U, hport = 7001U, heldp = 0xfe2fba54), line 638 in "host.c"
[3] h_GetHost_r(tcon = 0x11f3688), line 1118 in "host.c"
[4] h_FindClient_r(tcon = 0x11f3688), line 1676 in "host.c"
=>[5] CallPreamble(acall = 0x11a2340, activecall = 0, tconn = 0xfe2fbd34, ahostp = 0xfe2fbd30), line 317 in "afsfileprocs.c"
[6] SRXAFS_GetTime(acall = 0x11a2340, Seconds = 0xfe2fbdb4, USeconds = 0xfe2fbdb0), line 6793 in "afsfileprocs.c"
[7] _RXAFS_GetTime(z_call = 0x11a2340, z_xdrs = 0xfe2fbe38), line 1109 in "afsint.ss.c"
[8] RXAFS_ExecuteRequest(z_call = 0x11a2340), line 1941 in "afsint.ss.c"
[9] rxi_ServerProc(threadID = 98, newcall = (nil), socketp = 0xfe2fbf34), line 1407 in "rx.c"
[10] rx_ServerProc(), line 300 in "rx_pthread.c"
[11] server_entry(argp = 0xc84a0), line 98 in "rx_pthread.c"
(dbx) dump
hoststr =3D "=FE/=BD\034"
tconn = 0xfe2fbd34
hoststr2 = ""
thost = (nil)
code = 0
retry_flag = 1
activecall = 0
ahostp = 0xfe2fbd30
acall = 0x11a2340
tclient = 0xc31ea8
(dbx) list
317 tclient = h_FindClient_r(*tconn);
318 thost = tclient->host;
319 if (tclient->prfail == 1) { /* couldn't get the CPS */
320 if (!retry_flag) {
321 h_ReleaseClient_r(tclient);
322 h_Release_r(thost);
323 ViceLog(0, ("CallPreamble: Couldn't get CPS. Fail\n"));
324 H_UNLOCK;
325 return -1001;
326 }
(dbx) up
Current function is SRXAFS_GetTime
6793 if ((code = CallPreamble(acall, NOTACTIVECALL, &tcon, &thost)))
(dbx) where
current thread: t@15
[1] h_gethostcps_r(host = (nil), now = 1150115729), line 490 in "host.c"
[2] h_Lookup_r(haddr = 2303680610U, hport = 7001U, heldp = 0xfe2fba54), line 638 in "host.c"
[3] h_GetHost_r(tcon = 0x11f3688), line 1118 in "host.c"
[4] h_FindClient_r(tcon = 0x11f3688), line 1676 in "host.c"
[5] CallPreamble(acall = 0x11a2340, activecall = 0, tconn = 0xfe2fbd34, ahostp = 0xfe2fbd30), line 317 in "afsfileprocs.c"
=>[6] SRXAFS_GetTime(acall = 0x11a2340, Seconds = 0xfe2fbdb4, USeconds = 0xfe2fbdb0), line 6793 in "afsfileprocs.c"
[7] _RXAFS_GetTime(z_call = 0x11a2340, z_xdrs = 0xfe2fbe38), line 1109 in "afsint.ss.c"
[8] RXAFS_ExecuteRequest(z_call = 0x11a2340), line 1941 in "afsint.ss.c"
[9] rxi_ServerProc(threadID = 98, newcall = (nil), socketp = 0xfe2fbf34), line 1407 in "rx.c"
[10] rx_ServerProc(), line 300 in "rx_pthread.c"
[11] server_entry(argp = 0xc84a0), line 98 in "rx_pthread.c"
(dbx) dump
elapsedTime = RECORD
opStartTime = RECORD
opP = 0x167aa4
thost = 0x10cae80
code = 0
tpl = RECORD
USeconds = 0xfe2fbdb0
tcon = 0x11f3688
opStopTime = RECORD
acall = 0x11a2340
Seconds = 0xfe2fbdb4
(dbx) list
6793 if ((code = CallPreamble(acall, NOTACTIVECALL, &tcon, &thost)))
6794 goto Bad_GetTime;
6795
6796 FS_LOCK;
6797 AFSCallStats.GetTime++, AFSCallStats.TotalCalls++;
6798 FS_UNLOCK;
6799 TM_GetTimeOfDay(&tpl, 0);
6800 *Seconds = tpl.tv_sec;
6801 *USeconds = tpl.tv_usec;
6802
(dbx)
And today
bash-2.05$ dbx fileserver core_jplis-fil-afs32_fileserver_0_0_1150225984_3201
For information about new features see `help changes'
To remove this message, put `dbxenv suppress_startup_message 7.4' in your .dbxrc
Reading fileserver
core file header read successfully
Reading ld.so.1
Reading libpthread.so.1
Reading libsocket.so.1
Reading libresolv.so.2
Reading libnsl.so.1
Reading libintl.so.1
Reading libdl.so.1
Reading libc.so.1
Reading libmp.so.2
Reading libc_psr.so.1
Reading libthread.so.1
Reading nss_files.so.1
WARNING!!
A loadobject was found with an unexpected checksum value.
See `help core mismatch' for details, and run `proc -map'
to see what checksum values were expected and found.
dbx: warning: Some symbolic information might be incorrect.
t@104 (l@104) terminated by signal SEGV (no mapping at the fault address)
Current function is h_gethostcps_r
490 host->hcpsfailed = 0;
(dbx) where
current thread: t@104
=>[1] h_gethostcps_r(host = (nil), now = 1150225984), line 490 in "host.c"
[2] h_Lookup_r(haddr = 2313442094U, hport = 7001U, heldp = 0xf89fba54), line 638 in "host.c"
[3] h_GetHost_r(tcon = 0x137fef0), line 1118 in "host.c"
[4] h_FindClient_r(tcon = 0x137fef0), line 1676 in "host.c"
[5] CallPreamble(acall = 0x1267388, activecall = 0, tconn = 0xf89fbd34, ahostp = 0xf89fbd30), line 317 in "afsfileprocs.c"
[6] SRXAFS_GetTime(acall = 0x1267388, Seconds = 0xf89fbdb4, USeconds = 0xf89fbdb0), line 6793 in "afsfileprocs.c"
[7] _RXAFS_GetTime(z_call = 0x1267388, z_xdrs = 0xf89fbe38), line 1109 in "afsint.ss.c"
[8] RXAFS_ExecuteRequest(z_call = 0x1267388), line 1941 in "afsint.ss.c"
[9] rxi_ServerProc(threadID = 50, newcall = (nil), socketp = 0xf89fbf34), line 1407 in "rx.c"
[10] rx_ServerProc(), line 300 in "rx_pthread.c"
[11] server_entry(argp = 0xc84a0), line 98 in "rx_pthread.c"
(dbx) up
Current function is h_Lookup_r
638 h_gethostcps_r(host, now);
(dbx) print host
host = 0x11f35d0
(dbx) print *host
*host = {
next = 0x12da4a8
prev = 0x12e63a0
callback_rxcon = 0x1301108
holds = (0, 262144, 0, 134217728, 0)
host = 2313442094U
port = 7001U
Console = '\0'
hostFlags = 69U
InSameNetwork = '\0'
dummy = ""
hcpsfailed = '\0'
hcps = {
prlist_len = 6U
prlist_val = 0x11c67a8
}
LastCall = 1150225384U
ActiveCall = 1150214360U
FirstClient = 0x12c2ff0
cpsCall = 1150225984U
interface = 0x12991a0
cblist = 401U
index = 962U
lock = {
wait_states = '\0'
excl_locked = '\0'
readers_reading = '\0'
num_waiting = '\0'
mutex = {
__pthread_mutex_flags = {
__pthread_mutex_flag1 = 4U
__pthread_mutex_flag2 = '\0'
__pthread_mutex_ceiling = '\0'
__pthread_mutex_type = 0
__pthread_mutex_magic = 19800U
}
__pthread_mutex_lock = {
__pthread_mutex_lock64 = {
__pthread_mutex_pad = ""
}
__pthread_mutex_lock32 = {
__pthread_ownerpid = 0
__pthread_lockword = 0
}
__pthread_mutex_owner64 = 0
}
__pthread_mutex_data = 0
}
read_cv = {
__pthread_cond_flags = {
__pthread_cond_flag = ""
__pthread_cond_type = 0
__pthread_cond_magic = 17238U
}
__pthread_cond_data = 0
}
write_cv = {
__pthread_cond_flags = {
__pthread_cond_flag = ""
__pthread_cond_type = 0
__pthread_cond_magic = 17238U
}
__pthread_cond_data = 0
}
}
cond = {
__pthread_cond_flags = {
__pthread_cond_flag = ""
__pthread_cond_type = 0
__pthread_cond_magic = 17238U
}
__pthread_cond_data = 0
}
}
(dbx) down
Current function is h_gethostcps_r
490 host->hcpsfailed = 0;
(dbx) print host
host = (nil)
(dbx) print *host
dbx: reference through nil pointer
(dbx)