[OpenAFS-devel] SIGSEGV in host.c, 1.4.1b, twice in past two days

Kim Kimball kim.kimball@jpl.nasa.gov
Tue, 13 Jun 2006 13:58:25 -0600


One of our recently installed 1.4.1b fileservers dumped core in host.c
yesterday (6/12/2006) and again today, about half an hour ago.

Both cores point to h_gethostcps_r; I must be missing something, because
it looks like the call to h_gethostcps_r is correct, but the pointer to
the host structure is magically (since I can't figure out where it
happens :) null when the function starts to execute.

Kim Kimball

dhk at ccre.com

bash-2.05$ dbx  fileserver core_jplis-fil-afs32_fileserver_0_0_1150115729_2944
For information about new features see `help changes'
To remove this message, put `dbxenv suppress_startup_message 7.4' in your .dbxrc
Reading fileserver
core file header read successfully
Reading ld.so.1
Reading libpthread.so.1
Reading libsocket.so.1
Reading libresolv.so.2
Reading libnsl.so.1
Reading libintl.so.1
Reading libdl.so.1
Reading libc.so.1
Reading libmp.so.2
Reading libc_psr.so.1
Reading libthread.so.1
Reading nss_files.so.1
WARNING!!
A loadobject was found with an unexpected checksum value.
See `help core mismatch' for details, and run `proc -map'
to see what checksum values were expected and found.
dbx: warning: Some symbolic information might be incorrect.
t@15 (l@15) terminated by signal SEGV (no mapping at the fault address)
Current function is h_gethostcps_r
 490           host->hcpsfailed = 0;
(dbx) where
current thread: t@15
=>[1] h_gethostcps_r(host = (nil), now = 1150115729), line 490 in "host.c"
 [2] h_Lookup_r(haddr = 2303680610U, hport = 7001U, heldp = 0xfe2fba54), line 638 in "host.c"
 [3] h_GetHost_r(tcon = 0x11f3688), line 1118 in "host.c"
 [4] h_FindClient_r(tcon = 0x11f3688), line 1676 in "host.c"
 [5] CallPreamble(acall = 0x11a2340, activecall = 0, tconn = 0xfe2fbd34, ahostp = 0xfe2fbd30), line 317 in "afsfileprocs.c"
 [6] SRXAFS_GetTime(acall = 0x11a2340, Seconds = 0xfe2fbdb4, USeconds = 0xfe2fbdb0), line 6793 in "afsfileprocs.c"
 [7] _RXAFS_GetTime(z_call = 0x11a2340, z_xdrs = 0xfe2fbe38), line 1109 in "afsint.ss.c"
 [8] RXAFS_ExecuteRequest(z_call = 0x11a2340), line 1941 in "afsint.ss.c"
 [9] rxi_ServerProc(threadID = 98, newcall = (nil), socketp = 0xfe2fbf34), line 1407 in "rx.c"
 [10] rx_ServerProc(), line 300 in "rx_pthread.c"
 [11] server_entry(argp = 0xc84a0), line 98 in "rx_pthread.c"
(dbx) dump
slept = 0
code = 0
host = (nil)
now = 1150115729
(dbx) list
 490           host->hcpsfailed = 0;
 491
 492       host->hostFlags &= ~HCPS_INPROGRESS;
 493       /* signal all who are waiting */
 494       if (host->hostFlags & HCPS_WAITING) {       /* somebody is waiting */
 495           host->hostFlags &= ~HCPS_WAITING;
 496   #ifdef AFS_PTHREAD_ENV
 497           assert(pthread_cond_broadcast(&host->cond) == 0);
 498   #else /* AFS_PTHREAD_ENV */
 499           if ((code = LWP_NoYieldSignal(&(host->hostFlags))) != LWP_SUCCESS)
(dbx) print host
host = (nil)
(dbx) print *host
dbx: reference through nil pointer
(dbx) # Assigned to null pointer to structure
(dbx) up
Current function is h_Lookup_r
 638                   h_gethostcps_r(host, now);
(dbx) dump
index = 98
haddr = 2303680610U
hport = 7001U
heldp = 0xfe2fba54
host = 0x113e628
now = 1150115729
chain = 0x11b37b0
(dbx) list
 638                   h_gethostcps_r(host, now);
 639               }
 640               break;
 641           }
 642           host = NULL;
 643       }
 644       return host;
 645
 646   }                               /*h_Lookup */
 647
(dbx) where
current thread: t@15
 [1] h_gethostcps_r(host = (nil), now = 1150115729), line 490 in "host.c"
=>[2] h_Lookup_r(haddr = 2303680610U, hport = 7001U, heldp = 0xfe2fba54), line 638 in "host.c"
 [3] h_GetHost_r(tcon = 0x11f3688), line 1118 in "host.c"
 [4] h_FindClient_r(tcon = 0x11f3688), line 1676 in "host.c"
 [5] CallPreamble(acall = 0x11a2340, activecall = 0, tconn = 0xfe2fbd34, ahostp = 0xfe2fbd30), line 317 in "afsfileprocs.c"
 [6] SRXAFS_GetTime(acall = 0x11a2340, Seconds = 0xfe2fbdb4, USeconds = 0xfe2fbdb0), line 6793 in "afsfileprocs.c"
 [7] _RXAFS_GetTime(z_call = 0x11a2340, z_xdrs = 0xfe2fbe38), line 1109 in "afsint.ss.c"
 [8] RXAFS_ExecuteRequest(z_call = 0x11a2340), line 1941 in "afsint.ss.c"
 [9] rxi_ServerProc(threadID = 98, newcall = (nil), socketp = 0xfe2fbf34), line 1407 in "rx.c"
 [10] rx_ServerProc(), line 300 in "rx_pthread.c"
 [11] server_entry(argp = 0xc84a0), line 98 in "rx_pthread.c"
(dbx) up
Current function is h_GetHost_r
1118       host = h_Lookup_r(haddr, hport, &held);
(dbx) dump
interfValid = 0
hoststr =3D "=BC=FE/=B9\030"
haddr = 2303680610U
interf = RECORD
hoststr2 =3D "LS=FE/=B8=B8=FF\031\037=E4=FE/=B9=D4=FE/"
oldHost = 0x1
identP = (nil)
caps = RECORD
hport = 7001U
cb_conn = (nil)
code = 0
host = (nil)
oheld = 98
held = 0
tcon = 0x11f3688
(dbx) list
1118       host = h_Lookup_r(haddr, hport, &held);
1119       identP = (struct Identity *)rx_GetSpecific(tcon, rxcon_ident_key);
1120       if (host && !identP && !(host->Console & 1)) {
1121           /* This is a new connection, and we already have a host
1122            * structure for this address. Verify that the identity
1123            * of the caller matches the identity in the host structure.
1124            */
1125           h_Lock_r(host);
1126           if (!(host->hostFlags & ALTADDR)) {
1127               /* Another thread is doing initialization */
(dbx) where
current thread: t@15
 [1] h_gethostcps_r(host = (nil), now = 1150115729), line 490 in "host.c"
 [2] h_Lookup_r(haddr = 2303680610U, hport = 7001U, heldp = 0xfe2fba54), line 638 in "host.c"
=>[3] h_GetHost_r(tcon = 0x11f3688), line 1118 in "host.c"
 [4] h_FindClient_r(tcon = 0x11f3688), line 1676 in "host.c"
 [5] CallPreamble(acall = 0x11a2340, activecall = 0, tconn = 0xfe2fbd34, ahostp = 0xfe2fbd30), line 317 in "afsfileprocs.c"
 [6] SRXAFS_GetTime(acall = 0x11a2340, Seconds = 0xfe2fbdb4, USeconds = 0xfe2fbdb0), line 6793 in "afsfileprocs.c"
 [7] _RXAFS_GetTime(z_call = 0x11a2340, z_xdrs = 0xfe2fbe38), line 1109 in "afsint.ss.c"
 [8] RXAFS_ExecuteRequest(z_call = 0x11a2340), line 1941 in "afsint.ss.c"
 [9] rxi_ServerProc(threadID = 98, newcall = (nil), socketp = 0xfe2fbf34), line 1407 in "rx.c"
 [10] rx_ServerProc(), line 300 in "rx_pthread.c"
 [11] server_entry(argp = 0xc84a0), line 98 in "rx_pthread.c"
(dbx) up
Current function is h_FindClient_r
1676           host = h_GetHost_r(tcon);       /* Returns it h_Held */
(dbx) dump
tinst = ""
client = (nil)
expTime = 2147483647
created = 0
code = 1
host = 0x543878
fail = 0
tname = ""
tcell = ""
authClass = 0
tcon = 0x11f3688
viceid = 32766
oldClient = (nil)
uname = ""
(dbx) list
1676           host = h_GetHost_r(tcon);       /* Returns it h_Held */
1677
1678       retryfirstclient:
1679           /* First try to find the client structure */
1680           for (client = host->FirstClient; client; client = client->next) {
1681               if (!client->deleted && (client->sid == rxr_CidOf(tcon))
1682                   && (client->VenusEpoch == rxr_GetEpoch(tcon))) {
1683                   if (client->tcon && (client->tcon != tcon)) {
1684                       ViceLog(0,
1685                               ("*** Vid=%d, sid=%x, tcon=%x, Tcon=%x ***\n",
(dbx) up
Current function is CallPreamble
 317       tclient = h_FindClient_r(*tconn);
(dbx) where
current thread: t@15
 [1] h_gethostcps_r(host = (nil), now = 1150115729), line 490 in "host.c"
 [2] h_Lookup_r(haddr = 2303680610U, hport = 7001U, heldp = 0xfe2fba54), line 638 in "host.c"
 [3] h_GetHost_r(tcon = 0x11f3688), line 1118 in "host.c"
 [4] h_FindClient_r(tcon = 0x11f3688), line 1676 in "host.c"
=>[5] CallPreamble(acall = 0x11a2340, activecall = 0, tconn = 0xfe2fbd34, ahostp = 0xfe2fbd30), line 317 in "afsfileprocs.c"
 [6] SRXAFS_GetTime(acall = 0x11a2340, Seconds = 0xfe2fbdb4, USeconds = 0xfe2fbdb0), line 6793 in "afsfileprocs.c"
 [7] _RXAFS_GetTime(z_call = 0x11a2340, z_xdrs = 0xfe2fbe38), line 1109 in "afsint.ss.c"
 [8] RXAFS_ExecuteRequest(z_call = 0x11a2340), line 1941 in "afsint.ss.c"
 [9] rxi_ServerProc(threadID = 98, newcall = (nil), socketp = 0xfe2fbf34), line 1407 in "rx.c"
 [10] rx_ServerProc(), line 300 in "rx_pthread.c"
 [11] server_entry(argp = 0xc84a0), line 98 in "rx_pthread.c"
(dbx) dump
hoststr =3D "=FE/=BD\034"
tconn = 0xfe2fbd34
hoststr2 = ""
thost = (nil)
code = 0
retry_flag = 1
activecall = 0
ahostp = 0xfe2fbd30
acall = 0x11a2340
tclient = 0xc31ea8
(dbx) list
 317       tclient = h_FindClient_r(*tconn);
 318       thost = tclient->host;
 319       if (tclient->prfail == 1) { /* couldn't get the CPS */
 320           if (!retry_flag) {
 321               h_ReleaseClient_r(tclient);
 322               h_Release_r(thost);
 323               ViceLog(0, ("CallPreamble: Couldn't get CPS. Fail\n"));
 324               H_UNLOCK;
 325               return -1001;
 326           }
(dbx) up
Current function is SRXAFS_GetTime
6793       if ((code = CallPreamble(acall, NOTACTIVECALL, &tcon, &thost)))
(dbx) where
current thread: t@15
 [1] h_gethostcps_r(host = (nil), now = 1150115729), line 490 in "host.c"
 [2] h_Lookup_r(haddr = 2303680610U, hport = 7001U, heldp = 0xfe2fba54), line 638 in "host.c"
 [3] h_GetHost_r(tcon = 0x11f3688), line 1118 in "host.c"
 [4] h_FindClient_r(tcon = 0x11f3688), line 1676 in "host.c"
 [5] CallPreamble(acall = 0x11a2340, activecall = 0, tconn = 0xfe2fbd34, ahostp = 0xfe2fbd30), line 317 in "afsfileprocs.c"
=>[6] SRXAFS_GetTime(acall = 0x11a2340, Seconds = 0xfe2fbdb4, USeconds = 0xfe2fbdb0), line 6793 in "afsfileprocs.c"
 [7] _RXAFS_GetTime(z_call = 0x11a2340, z_xdrs = 0xfe2fbe38), line 1109 in "afsint.ss.c"
 [8] RXAFS_ExecuteRequest(z_call = 0x11a2340), line 1941 in "afsint.ss.c"
 [9] rxi_ServerProc(threadID = 98, newcall = (nil), socketp = 0xfe2fbf34), line 1407 in "rx.c"
 [10] rx_ServerProc(), line 300 in "rx_pthread.c"
 [11] server_entry(argp = 0xc84a0), line 98 in "rx_pthread.c"
(dbx) dump
elapsedTime = RECORD
opStartTime = RECORD
opP = 0x167aa4
thost = 0x10cae80
code = 0
tpl = RECORD
USeconds = 0xfe2fbdb0
tcon = 0x11f3688
opStopTime = RECORD
acall = 0x11a2340
Seconds = 0xfe2fbdb4
(dbx) list
6793       if ((code = CallPreamble(acall, NOTACTIVECALL, &tcon, &thost)))
6794           goto Bad_GetTime;
6795
6796       FS_LOCK;
6797       AFSCallStats.GetTime++, AFSCallStats.TotalCalls++;
6798       FS_UNLOCK;
6799       TM_GetTimeOfDay(&tpl, 0);
6800       *Seconds = tpl.tv_sec;
6801       *USeconds = tpl.tv_usec;
6802
(dbx)


And today

bash-2.05$ dbx  fileserver core_jplis-fil-afs32_fileserver_0_0_1150225984_3201
For information about new features see `help changes'
To remove this message, put `dbxenv suppress_startup_message 7.4' in your .dbxrc
Reading fileserver
core file header read successfully
Reading ld.so.1
Reading libpthread.so.1
Reading libsocket.so.1
Reading libresolv.so.2
Reading libnsl.so.1
Reading libintl.so.1
Reading libdl.so.1
Reading libc.so.1
Reading libmp.so.2
Reading libc_psr.so.1
Reading libthread.so.1
Reading nss_files.so.1
WARNING!!
A loadobject was found with an unexpected checksum value.
See `help core mismatch' for details, and run `proc -map'
to see what checksum values were expected and found.
dbx: warning: Some symbolic information might be incorrect.
t@104 (l@104) terminated by signal SEGV (no mapping at the fault address)
Current function is h_gethostcps_r
  490           host->hcpsfailed = 0;
(dbx) where
current thread: t@104
=>[1] h_gethostcps_r(host = (nil), now = 1150225984), line 490 in "host.c"
  [2] h_Lookup_r(haddr = 2313442094U, hport = 7001U, heldp = 0xf89fba54), line 638 in "host.c"
  [3] h_GetHost_r(tcon = 0x137fef0), line 1118 in "host.c"
  [4] h_FindClient_r(tcon = 0x137fef0), line 1676 in "host.c"
  [5] CallPreamble(acall = 0x1267388, activecall = 0, tconn = 0xf89fbd34, ahostp = 0xf89fbd30), line 317 in "afsfileprocs.c"
  [6] SRXAFS_GetTime(acall = 0x1267388, Seconds = 0xf89fbdb4, USeconds = 0xf89fbdb0), line 6793 in "afsfileprocs.c"
  [7] _RXAFS_GetTime(z_call = 0x1267388, z_xdrs = 0xf89fbe38), line 1109 in "afsint.ss.c"
  [8] RXAFS_ExecuteRequest(z_call = 0x1267388), line 1941 in "afsint.ss.c"
  [9] rxi_ServerProc(threadID = 50, newcall = (nil), socketp = 0xf89fbf34), line 1407 in "rx.c"
  [10] rx_ServerProc(), line 300 in "rx_pthread.c"
  [11] server_entry(argp = 0xc84a0), line 98 in "rx_pthread.c"
(dbx) up
Current function is h_Lookup_r
  638                   h_gethostcps_r(host, now);
(dbx) print host
host = 0x11f35d0
(dbx) print *host
*host = {
    next           = 0x12da4a8
    prev           = 0x12e63a0
    callback_rxcon = 0x1301108
    holds          = (0, 262144, 0, 134217728, 0)
    host           = 2313442094U
    port           = 7001U
    Console        = '\0'
    hostFlags      = 69U
    InSameNetwork  = '\0'
    dummy          = ""
    hcpsfailed     = '\0'
    hcps           = {
        prlist_len = 6U
        prlist_val = 0x11c67a8
    }
    LastCall       = 1150225384U
    ActiveCall     = 1150214360U
    FirstClient    = 0x12c2ff0
    cpsCall        = 1150225984U
    interface      = 0x12991a0
    cblist         = 401U
    index          = 962U
    lock           = {
        wait_states     = '\0'
        excl_locked     = '\0'
        readers_reading = '\0'
        num_waiting     = '\0'
        mutex           = {
            __pthread_mutex_flags = {
                __pthread_mutex_flag1   = 4U
                __pthread_mutex_flag2   = '\0'
                __pthread_mutex_ceiling = '\0'
                __pthread_mutex_type    = 0
                __pthread_mutex_magic   = 19800U
            }
            __pthread_mutex_lock  = {
                __pthread_mutex_lock64  = {
                    __pthread_mutex_pad = ""
                }
                __pthread_mutex_lock32  = {
                    __pthread_ownerpid = 0
                    __pthread_lockword = 0
                }
                __pthread_mutex_owner64 = 0
            }
            __pthread_mutex_data  = 0
        }
        read_cv         = {
            __pthread_cond_flags = {
                __pthread_cond_flag  = ""
                __pthread_cond_type  = 0
                __pthread_cond_magic = 17238U
            }
            __pthread_cond_data  = 0
        }
        write_cv        = {
            __pthread_cond_flags = {
                __pthread_cond_flag  = ""
                __pthread_cond_type  = 0
                __pthread_cond_magic = 17238U
            }
            __pthread_cond_data  = 0
        }
    }
    cond           = {
        __pthread_cond_flags = {
            __pthread_cond_flag  = ""
            __pthread_cond_type  = 0
            __pthread_cond_magic = 17238U
        }
        __pthread_cond_data  = 0
    }
}
(dbx) down
Current function is h_gethostcps_r
  490           host->hcpsfailed = 0;
(dbx) print host
host = (nil)
(dbx) print *host
dbx: reference through nil pointer
(dbx)