diff --git a/dissect/executable/__init__.py b/dissect/executable/__init__.py index 43d2a88..710b000 100644 --- a/dissect/executable/__init__.py +++ b/dissect/executable/__init__.py @@ -1,5 +1,7 @@ from dissect.executable.elf import ELF +from dissect.executable.pdb import PDB __all__ = [ "ELF", + "PDB", ] diff --git a/dissect/executable/pdb/__init__.py b/dissect/executable/pdb/__init__.py new file mode 100644 index 0000000..bdc3c55 --- /dev/null +++ b/dissect/executable/pdb/__init__.py @@ -0,0 +1,5 @@ +from dissect.executable.pdb.pdb import PDB + +__all__ = [ + "PDB", +] diff --git a/dissect/executable/pdb/helpers/__init__.py b/dissect/executable/pdb/helpers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dissect/executable/pdb/helpers/c_pdb.py b/dissect/executable/pdb/helpers/c_pdb.py new file mode 100644 index 0000000..60a8702 --- /dev/null +++ b/dissect/executable/pdb/helpers/c_pdb.py @@ -0,0 +1,1328 @@ +from dissect.cstruct import cstruct + +pdb_def = """ +///////////////////////////////////////////////////////////////////////// +// PDB generic definitions +///////////////////////////////////////////////////////////////////////// +typedef uint32 OFF; +typedef uint32 CB; +typedef uint16 SN; + +typedef uint32 CV_typ_t; +typedef CV_typ_t TI; +typedef ushort CV_typ16_t; + +struct OffCb { // offset, cb pair + OFF off; + CB cb; +}; + +struct CV_funcattr_t { + uint8 cxxreturnudt :1; // true if C++ style ReturnUDT + uint8 ctor :1; // true if func is an instance constructor + uint8 ctorvbase :1; // true if func is an instance constructor of a class with virtual bases + uint8 unused :5; // unused +}; + +struct DATA_STREAM_V7 { + uint32 stream_size; +}; + +struct ROOT_STREAM_V7 { + uint32 dStreams; + DATA_STREAM_V7 streamLengths[dStreams]; +}; + +struct PDB7_HEADER { + char signature[32]; + uint32 page_size; + uint32 alloc_table_ptr; + uint32 num_file_pages; + uint32 root_size; + uint32 reserved; + uint32 root_page_index; +}; + +struct DATA_STREAM_V2 { 
+ uint32 stream_size; + uint32 reserved; +}; + +struct ROOT_STREAM_V2 { + uint16 dStreams; + uint16 reserved; + DATA_STREAM_V2 streamLengths[dStreams]; // one DATA_STREAM_V2 entry (stream_size + reserved) per stream +}; + +struct PDB2_HEADER { + char signature[44]; + uint32 page_size; + uint16 start_page; + uint16 num_file_pages; + uint32 root_size; + uint32 reserved; +}; + +struct PORTABLE_PDB_HEADER { + char signature[4]; + uint16 majorVersion; + uint16 minorVersion; + uint32 reserved; + uint32 versionLength; + char version[versionLength]; +}; + +///////////////////////////////////////////////////////////////////////// +// TPI specific definitions +// +// General overview: https://github.com/microsoft/microsoft-pdb/blob/master/PDB/dbi/tpi.h +///////////////////////////////////////////////////////////////////////// + +enum TPIIMPV { + impv40 = 19950410, + impv41 = 19951122, + impv50Interim = 19960307, + impv50 = 19961031, + impv70 = 19990903, + impv80 = 20040203, +}; + +struct TpiHash { + SN sn; // main hash stream + SN snPad; // auxiliary hash data if necessary + CB cbHashKey; // size of hash key + uint32 cHashBuckets; // how many buckets we have + OffCb offcbHashVals; // offcb of hashvals + OffCb offcbTiOff; // offcb of (TI,OFF) pairs + OffCb offcbHashAdj; // offcb of hash head list, maps (hashval,ti), where ti is the head of the hashval chain. +}; + +struct TpiHeader { // type database header: + TPIIMPV vers; // version which created this TypeServer + CB cbHdr; // size of the header, allows easier upgrading and backwards compatibility + TI tiMin; // lowest TI + TI tiMax; // highest TI + 1 + CB cbGprec; // count of bytes used by the gprec which follows.
+ TpiHash tpihash; // hash stream schema +}; + +struct TpiType { + uint16 length; + char type_data[length]; +}; + +// Typing +// https://github.com/microsoft/microsoft-pdb/blob/082c5290e5aff028ae84e43affa8be717aa7af73/include/cvinfo.h +// + +enum LEAF_ENUM_e : uint16 { + // leaf indices starting records but referenced from symbol records + + LF_MODIFIER_16t = 0x01, + LF_POINTER_16t = 0x02, + LF_ARRAY_16t = 0x03, + LF_CLASS_16t = 0x04, + LF_STRUCTURE_16t = 0x05, + LF_UNION_16t = 0x06, + LF_ENUM_16t = 0x07, + LF_PROCEDURE_16t = 0x08, + LF_MFUNCTION_16t = 0x09, + LF_VTSHAPE = 0x0a, + LF_COBOL0_16t = 0x0b, + LF_COBOL1 = 0x0c, + LF_BARRAY_16t = 0x0d, + LF_LABEL = 0x0e, + LF_NULL = 0x0f, + LF_NOTTRAN = 0x10, + LF_DIMARRAY_16t = 0x11, + LF_VFTPATH_16t = 0x12, + LF_PRECOMP_16t = 0x13, // not referenced from symbol + LF_ENDPRECOMP = 0x14, // not referenced from symbol + LF_OEM_16t = 0x15, // oem definable type string + LF_TYPESERVER_ST = 0x16, // not referenced from symbol + + // leaf indices starting records but referenced only from type records + + LF_SKIP_16t = 0x0200, + LF_ARGLIST_16t = 0x0201, + LF_DEFARG_16t = 0x0202, + LF_LIST = 0x0203, + LF_FIELDLIST_16t = 0x0204, + LF_DERIVED_16t = 0x0205, + LF_BITFIELD_16t = 0x0206, + LF_METHODLIST_16t = 0x0207, + LF_DIMCONU_16t = 0x0208, + LF_DIMCONLU_16t = 0x0209, + LF_DIMVARU_16t = 0x020a, + LF_DIMVARLU_16t = 0x020b, + LF_REFSYM = 0x020c, + + LF_BCLASS_16t = 0x0400, + LF_VBCLASS_16t = 0x0401, + LF_IVBCLASS_16t = 0x0402, + LF_ENUMERATE_ST = 0x0403, + LF_FRIENDFCN_16t = 0x0404, + LF_INDEX_16t = 0x0405, + LF_MEMBER_16t = 0x0406, + LF_STMEMBER_16t = 0x0407, + LF_METHOD_16t = 0x0408, + LF_NESTTYPE_16t = 0x0409, + LF_VFUNCTAB_16t = 0x040a, + LF_FRIENDCLS_16t = 0x040b, + LF_ONEMETHOD_16t = 0x040c, + LF_VFUNCOFF_16t = 0x040d, + +// 32-bit type index versions of leaves, all have the 0x1000 bit set + + LF_TI16_MAX = 0x1000, + LF_MODIFIER = 0x1001, + LF_POINTER = 0x1002, + LF_ARRAY_ST = 0x1003, + LF_CLASS_ST = 0x1004, + LF_STRUCTURE_ST = 
0x1005, + LF_UNION_ST = 0x1006, + LF_ENUM_ST = 0x1007, + LF_PROCEDURE = 0x1008, + LF_MFUNCTION = 0x1009, + LF_COBOL0 = 0x100a, + LF_BARRAY = 0x100b, + LF_DIMARRAY_ST = 0x100c, + LF_VFTPATH = 0x100d, + LF_PRECOMP_ST = 0x100e, // not referenced from symbol + LF_OEM = 0x100f, // oem definable type string + LF_ALIAS_ST = 0x1010, // alias (typedef) type + LF_OEM2 = 0x1011, // oem definable type string + + // leaf indices starting records but referenced only from type records + + LF_SKIP = 0x1200, + LF_ARGLIST = 0x1201, + LF_DEFARG_ST = 0x1202, + LF_FIELDLIST = 0x1203, + LF_DERIVED = 0x1204, + LF_BITFIELD = 0x1205, + LF_METHODLIST = 0x1206, + LF_DIMCONU = 0x1207, + LF_DIMCONLU = 0x1208, + LF_DIMVARU = 0x1209, + LF_DIMVARLU = 0x120a, + + LF_BCLASS = 0x1400, + LF_VBCLASS = 0x1401, + LF_IVBCLASS = 0x1402, + LF_FRIENDFCN_ST = 0x1403, + LF_INDEX = 0x1404, + LF_MEMBER_ST = 0x1405, + LF_STMEMBER_ST = 0x1406, + LF_METHOD_ST = 0x1407, + LF_NESTTYPE_ST = 0x1408, + LF_VFUNCTAB = 0x1409, + LF_FRIENDCLS = 0x140a, + LF_ONEMETHOD_ST = 0x140b, + LF_VFUNCOFF = 0x140c, + LF_NESTTYPEEX_ST = 0x140d, + LF_MEMBERMODIFY_ST = 0x140e, + LF_MANAGED_ST = 0x140f, + + // Types w/ SZ names + + LF_ST_MAX = 0x1500, + LF_TYPESERVER = 0x1501, // not referenced from symbol + LF_ENUMERATE = 0x1502, + LF_ARRAY = 0x1503, + LF_CLASS = 0x1504, + LF_STRUCTURE = 0x1505, + LF_UNION = 0x1506, + LF_ENUM = 0x1507, + LF_DIMARRAY = 0x1508, + LF_PRECOMP = 0x1509, // not referenced from symbol + LF_ALIAS = 0x150a, // alias (typedef) type + LF_DEFARG = 0x150b, + LF_FRIENDFCN = 0x150c, + LF_MEMBER = 0x150d, + LF_STMEMBER = 0x150e, + LF_METHOD = 0x150f, + LF_NESTTYPE = 0x1510, + LF_ONEMETHOD = 0x1511, + LF_NESTTYPEEX = 0x1512, + LF_MEMBERMODIFY = 0x1513, + LF_MANAGED = 0x1514, + LF_TYPESERVER2 = 0x1515, + + LF_STRIDED_ARRAY = 0x1516, // same as LF_ARRAY, but with stride between adjacent elements + LF_HLSL = 0x1517, + LF_MODIFIER_EX = 0x1518, + LF_INTERFACE = 0x1519, + LF_BINTERFACE = 0x151a, + LF_VECTOR = 0x151b, + 
LF_MATRIX = 0x151c, + + LF_VFTABLE = 0x151d, // a virtual function table + // LF_ENDOFLEAFRECORD = LF_VFTABLE, + + // LF_TYPE_LAST, // one greater than the last type record + // LF_TYPE_MAX = LF_TYPE_LAST - 1, + + LF_FUNC_ID = 0x1601, // global func ID + LF_MFUNC_ID = 0x1602, // member func ID + LF_BUILDINFO = 0x1603, // build info: tool, version, command line, src/pdb file + LF_SUBSTR_LIST = 0x1604, // similar to LF_ARGLIST, for list of sub strings + LF_STRING_ID = 0x1605, // string ID + + LF_UDT_SRC_LINE = 0x1606, // source and line on where an UDT is defined + // only generated by compiler + + LF_UDT_MOD_SRC_LINE = 0x1607, // module, source and line on where an UDT is defined + // only generated by linker + + // LF_ID_LAST, // one greater than the last ID record + // LF_ID_MAX = LF_ID_LAST - 1, + + LF_NUMERIC = 0x8000, + LF_CHAR = 0x8000, + LF_SHORT = 0x8001, + LF_USHORT = 0x8002, + LF_LONG = 0x8003, + LF_ULONG = 0x8004, + LF_REAL32 = 0x8005, + LF_REAL64 = 0x8006, + LF_REAL80 = 0x8007, + LF_REAL128 = 0x8008, + LF_QUADWORD = 0x8009, + LF_UQUADWORD = 0x800a, + LF_REAL48 = 0x800b, + LF_COMPLEX32 = 0x800c, + LF_COMPLEX64 = 0x800d, + LF_COMPLEX80 = 0x800e, + LF_COMPLEX128 = 0x800f, + LF_VARSTRING = 0x8010, + + LF_OCTWORD = 0x8017, + LF_UOCTWORD = 0x8018, + + LF_DECIMAL = 0x8019, + LF_DATE = 0x801a, + LF_UTF8STRING = 0x801b, + + LF_REAL16 = 0x801c, + + LF_PAD0 = 0xf0, + LF_PAD1 = 0xf1, + LF_PAD2 = 0xf2, + LF_PAD3 = 0xf3, + LF_PAD4 = 0xf4, + LF_PAD5 = 0xf5, + LF_PAD6 = 0xf6, + LF_PAD7 = 0xf7, + LF_PAD8 = 0xf8, + LF_PAD9 = 0xf9, + LF_PAD10 = 0xfa, + LF_PAD11 = 0xfb, + LF_PAD12 = 0xfc, + LF_PAD13 = 0xfd, + LF_PAD14 = 0xfe, + LF_PAD15 = 0xff, +}; + +struct CV_prop_t { + USHORT packed :1; // true if structure is packed + USHORT ctor :1; // true if constructors or destructors present + USHORT ovlops :1; // true if overloaded operators present + USHORT isnested :1; // true if this is a nested class + USHORT cnested :1; // true if this class contains nested types + USHORT 
opassign :1; // true if overloaded assignment (=) + USHORT opcast :1; // true if casting methods + USHORT fwdref :1; // true if forward reference (incomplete defn) + USHORT scoped :1; // scoped definition + USHORT hasuniquename :1; // true if there is a decorated name following the regular name + USHORT sealed :1; // true if class cannot be used as a base class + USHORT hfa :2; // CV_HFA_e + USHORT intrinsic :1; // true if class is an intrinsic type (e.g. __m128d) + USHORT mocom :2; // CV_MOCOM_UDT_e +}; + +// LF_MODIFIER + +struct CV_modifier_t { + unsigned short MOD_const :1; + unsigned short MOD_volatile :1; + unsigned short MOD_unaligned :1; + unsigned short MOD_unused :13; +}; + +struct LF_MODIFIER { + CV_typ_t type; // modified type + CV_modifier_t attr; // modifier attribute modifier_t +}; + +// LF_POINTER + +enum CV_ptrmode_e { + CV_PTR_MODE_PTR = 0x00, // "normal" pointer + CV_PTR_MODE_REF = 0x01, // "old" reference + CV_PTR_MODE_LVREF = 0x01, // l-value reference + CV_PTR_MODE_PMEM = 0x02, // pointer to data member + CV_PTR_MODE_PMFUNC = 0x03, // pointer to member function + CV_PTR_MODE_RVREF = 0x04, // r-value reference + CV_PTR_MODE_RESERVED = 0x05 // first unused pointer mode +}; + +enum CV_ptrtype_e { + CV_PTR_NEAR = 0x00, // 16 bit pointer + CV_PTR_FAR = 0x01, // 16:16 far pointer + CV_PTR_HUGE = 0x02, // 16:16 huge pointer + CV_PTR_BASE_SEG = 0x03, // based on segment + CV_PTR_BASE_VAL = 0x04, // based on value of base + CV_PTR_BASE_SEGVAL = 0x05, // based on segment value of base + CV_PTR_BASE_ADDR = 0x06, // based on address of base + CV_PTR_BASE_SEGADDR = 0x07, // based on segment address of base + CV_PTR_BASE_TYPE = 0x08, // based on type + CV_PTR_BASE_SELF = 0x09, // based on self + CV_PTR_NEAR32 = 0x0a, // 32 bit pointer + CV_PTR_FAR32 = 0x0b, // 16:32 pointer + CV_PTR_64 = 0x0c, // 64 bit pointer + CV_PTR_UNUSEDPTR = 0x0d // first unused pointer type +}; + +struct lfPointerAttr { + CV_ptrtype_e ptrtype :5; // ordinal specifying pointer type 
(CV_ptrtype_e) + CV_ptrmode_e ptrmode :3; // ordinal specifying pointer mode (CV_ptrmode_e) + unsigned long isflat32 :1; // true if 0:32 pointer + unsigned long isvolatile :1; // TRUE if volatile pointer + unsigned long isconst :1; // TRUE if const pointer + unsigned long isunaligned :1; // TRUE if unaligned pointer + unsigned long isrestrict :1; // TRUE if restricted pointer (allow aggressive opts) + unsigned long size :6; // size of pointer (in bytes) + unsigned long ismocom :1; // TRUE if it is a MoCOM pointer (^ or %) + unsigned long islref :1; // TRUE if it is this pointer of member function with & ref-qualifier + unsigned long isrref :1; // TRUE if it is this pointer of member function with && ref-qualifier + unsigned long unused :10; // pad out to 32-bits for following cv_typ_t's +}; + +struct LF_POINTER { + CV_typ_t utype; // type index of the underlying type + lfPointerAttr attr; +}; + +// LF_ARGLIST +struct LF_ARGLIST_16t { + unsigned short count; // number of arguments + CV_typ16_t arg[count]; // 16-bit type index of each argument (count entries) +}; + +struct LF_ARGLIST { + unsigned long count; // number of arguments + CV_typ_t arg[count]; // type index of each argument (count entries) +}; + +// LF_ARRAY + +struct LF_ARRAY { + CV_typ_t elemtype; // type index of element type + CV_typ_t idxtype; // type index of indexing type + uint16 size; + unsigned char name[]; // variable length data specifying size in bytes and name +}; + +// LF_STRUCTURE + +struct LF_STRUCTURE { + uint16 count; + CV_prop_t property; // property attribute field (prop_t) + CV_typ_t field; // type index of LF_FIELD descriptor list + CV_typ_t derived; // type index of derived from list if not zero + CV_typ_t vshape; // type index of vshape table for this class +}; + +struct LF_STRUCTURE_ST { + uint16 count; + CV_prop_t property; // property attribute field (prop_t) + CV_typ_t field; // type index of LF_FIELD descriptor list + CV_typ_t derived; // type index of derived from list if not zero + CV_typ_t vshape; // type index of vshape table
for this class + uint16 size; +}; + +// LF_UNION + +struct LF_UNION { + unsigned short count; // count of number of elements in class + CV_prop_t property; // property attribute field + CV_typ_t field; // type index of LF_FIELD descriptor list + uint16 size; + unsigned char name[]; // variable length data describing length of structure and name +}; + +// LF_FIELDLIST STRUCTS + +enum CV_methodprop_e : uint8 { + CV_MTvanilla = 0x00, + CV_MTvirtual = 0x01, + CV_MTstatic = 0x02, + CV_MTfriend = 0x03, + CV_MTintro = 0x04, + CV_MTpurevirt = 0x05, + CV_MTpureintro = 0x06, +}; + +typedef struct CV_fldattr_t { + unsigned short access :2; // access protection CV_access_t + unsigned short mprop :3; // method properties CV_methodprop_t + unsigned short pseudo :1; // compiler generated fcn and does not exist + unsigned short noinherit :1; // true if class cannot be inherited + unsigned short noconstruct :1; // true if class cannot be constructed + unsigned short compgenx :1; // compiler generated fcn and does exist + unsigned short sealed :1; // true if method cannot be overridden + unsigned short unused :6; // unused +}; + +struct LF_MEMBER { + CV_fldattr_t attr; // attribute mask + CV_typ_t index; // index of type record for field + uint16 offset; + unsigned char name[]; // variable length offset of field followed by length prefixed name of field +}; + +struct LF_STMEMBER { + CV_fldattr_t attr; // attribute mask + CV_typ_t index; // index of type record for field + unsigned char name[]; // variable length offset of field followed by length prefixed name of field +}; + +// LF_ONEMETHOD + +struct LF_ONEMETHOD_HEADER { + CV_fldattr_t attr; // attribute mask + CV_typ_t index; // index of type record for field +} + +struct LF_ONEMETHOD { + CV_fldattr_t attr; // attribute mask + CV_typ_t index; // index of type record for field + uint32 offset; // vtable offset + char name[]; +} + +// LF_ENUMERATE + +struct LF_ENUMERATE { + CV_fldattr_t attr; // access + uint16 value; // Leaf index 
used to check how many bytes to skip before the name + unsigned char name[]; // variable length value field followed by length prefixed name +}; + +// LF_ENUM + +struct LF_ENUM { + unsigned short count; // count of number of elements in class + CV_prop_t property; // property attribute field + CV_typ_t utype; // underlying type of the enum + CV_typ_t field; // type index of LF_FIELD descriptor list + unsigned char name[]; // length prefixed name of enum +}; + +// LF_PROCEDURE +struct LF_PROCEDURE_16t { + CV_typ16_t rvtype; // type index of return value + unsigned char calltype; // calling convention (CV_call_t) + CV_funcattr_t funcattr; // attributes + unsigned short parmcount; // number of parameters + CV_typ16_t arglist; // type index of argument list +}; + +struct LF_PROCEDURE { + CV_typ_t rvtype; // type index of return value + unsigned char calltype; // calling convention (CV_call_t) + CV_funcattr_t funcattr; // attributes + unsigned short parmcount; // number of parameters + CV_typ_t arglist; // type index of argument list +}; + +// LF_MFUNCTION + +struct LF_MFUNCTION_16t { + CV_typ16_t rvtype; // type index of return value + CV_typ16_t classtype; // type index of containing class + CV_typ16_t thistype; // type index of this pointer (model specific) + unsigned char calltype; // calling convention (call_t) + CV_funcattr_t funcattr; // attributes + unsigned short parmcount; // number of parameters + CV_typ16_t arglist; // type index of argument list + long thisadjust; // this adjuster (long because pad required anyway) +}; + +struct LF_MFUNCTION { + CV_typ_t rvtype; // type index of return value + CV_typ_t classtype; // type index of containing class + CV_typ_t thistype; // type index of this pointer (model specific) + unsigned char calltype; // calling convention (call_t) + CV_funcattr_t funcattr; // attributes + unsigned short parmcount; // number of parameters + CV_typ_t arglist; // type index of argument list + long thisadjust; // this adjuster (long 
because pad required anyway) +}; + +// LF_CLASS + +struct LF_CLASS_16t { + unsigned short count; // count of number of elements in class + CV_typ16_t field; // type index of LF_FIELD descriptor list + CV_prop_t property; // property attribute field (prop_t) + CV_typ16_t derived; // type index of derived from list if not zero + CV_typ16_t vshape; // type index of vshape table for this class + unsigned char data[]; // data describing length of structure in + // bytes and name +}; + +// typedef LF_CLASS_16t lfStructure_16t; + +struct LF_CLASS { + unsigned short count; // count of number of elements in class + CV_prop_t property; // property attribute field (prop_t) + CV_typ_t field; // type index of LF_FIELD descriptor list + CV_typ_t derived; // type index of derived from list if not zero + CV_typ_t vshape; // type index of vshape table for this class + unsigned char data[]; // data describing length of structure in + // bytes and name +}; + +// typedef LF_CLASS LF_STRUCTURE; +// typedef LF_CLASS lfInterface; + +// LF_METHODLIST + +struct LF_METHOD_16t { + unsigned short count; // number of occurrences of function + CV_typ16_t mList; // index to LF_METHODLIST record + unsigned char Name[]; // length prefixed name of method +}; + +struct LF_METHOD { + unsigned short count; // number of occurrences of function + CV_typ_t mList; // index to LF_METHODLIST record + unsigned char Name[]; // length prefixed name of method +}; + +// LF_VTABLE + +struct LF_VTABLE { + unsigned short count; // number of entries in vfunctable + unsigned char desc[count]; // 4 bit (CV_VTS_desc) descriptors +}; + +// type record for a virtual function table +struct LF_VFTABLE { + CV_typ_t type; // class/structure that owns the vftable + CV_typ_t baseVftable; // vftable from which this vftable is derived + unsigned long offsetInObjectLayout; // offset of the vfptr to this table, relative to the start of the object layout. + unsigned long len; // length of the Names array below in bytes. 
+ unsigned char Names[1]; // array of names. + // The first is the name of the vtable. + // The others are the names of the methods. + // TS-TODO: replace a name with a NamedCodeItem once Weiping is done, to + // avoid duplication of method names. +}; + +// LF_VFUNCTAB + +struct LF_VFUNCTAB { + uint16 unk1; + uint32 table; +}; + +// LF_BCLASS, LF_BINTERFACE + +struct LF_BCLASS { + CV_fldattr_t attr; + uint32 base_class; + uint32 offset; +}; + +// LF_NESTTYPE + +struct LF_NESTTYPE { + CV_fldattr_t attr; + uint32 nested_type; + char name[]; +}; + +// LF_VBCLASS + +struct LF_VBCLASS { + CV_fldattr_t attr; + uint32 base_class; + uint32 base_pointer; + uint16 base_pointer_offset; + uint16 virtual_base_offset; +}; + +struct dynamic_type { + LEAF_ENUM_e type_info; +}; + +enum SYM_ENUM_e : uint16 { + S_COMPILE = 0x0001, // Compile flags symbol + S_REGISTER_16t = 0x0002, // Register variable + S_CONSTANT_16t = 0x0003, // constant symbol + S_UDT_16t = 0x0004, // User defined type + S_SSEARCH = 0x0005, // Start Search + S_END = 0x0006, // Block, procedure, "with" or thunk end + S_SKIP = 0x0007, // Reserve symbol space in $$Symbols table + S_CVRESERVE = 0x0008, // Reserved symbol for CV internal use + S_OBJNAME_ST = 0x0009, // path to object file name + S_ENDARG = 0x000a, // end of argument/return list + S_COBOLUDT_16t = 0x000b, // special UDT for cobol that does not symbol pack + S_MANYREG_16t = 0x000c, // multiple register variable + S_RETURN = 0x000d, // return description symbol + S_ENTRYTHIS = 0x000e, // description of this pointer on entry + + S_BPREL16 = 0x0100, // BP-relative + S_LDATA16 = 0x0101, // Module-local symbol + S_GDATA16 = 0x0102, // Global data symbol + S_PUB16 = 0x0103, // a public symbol + S_LPROC16 = 0x0104, // Local procedure start + S_GPROC16 = 0x0105, // Global procedure start + S_THUNK16 = 0x0106, // Thunk Start + S_BLOCK16 = 0x0107, // block start + S_WITH16 = 0x0108, // with start + S_LABEL16 = 0x0109, // code label + S_CEXMODEL16 = 0x010a, //
change execution model + S_VFTABLE16 = 0x010b, // address of virtual function table + S_REGREL16 = 0x010c, // register relative address + + S_BPREL32_16t = 0x0200, // BP-relative + S_LDATA32_16t = 0x0201, // Module-local symbol + S_GDATA32_16t = 0x0202, // Global data symbol + S_PUB32_16t = 0x0203, // a public symbol (CV internal reserved) + S_LPROC32_16t = 0x0204, // Local procedure start + S_GPROC32_16t = 0x0205, // Global procedure start + S_THUNK32_ST = 0x0206, // Thunk Start + S_BLOCK32_ST = 0x0207, // block start + S_WITH32_ST = 0x0208, // with start + S_LABEL32_ST = 0x0209, // code label + S_CEXMODEL32 = 0x020a, // change execution model + S_VFTABLE32_16t = 0x020b, // address of virtual function table + S_REGREL32_16t = 0x020c, // register relative address + S_LTHREAD32_16t = 0x020d, // local thread storage + S_GTHREAD32_16t = 0x020e, // global thread storage + S_SLINK32 = 0x020f, // static link for MIPS EH implementation + + S_LPROCMIPS_16t = 0x0300, // Local procedure start + S_GPROCMIPS_16t = 0x0301, // Global procedure start + + // if these ref symbols have names following then the names are in ST format + S_PROCREF_ST = 0x0400, // Reference to a procedure + S_DATAREF_ST = 0x0401, // Reference to data + S_ALIGN = 0x0402, // Used for page alignment of symbols + + S_LPROCREF_ST = 0x0403, // Local Reference to a procedure + S_OEM = 0x0404, // OEM defined symbol + + // sym records with 32-bit types embedded instead of 16-bit + // all have 0x1000 bit set for easy identification + // only do the 32-bit target versions since we don't really + // care about 16-bit ones anymore. 
+ S_TI16_MAX = 0x1000, + + S_REGISTER_ST = 0x1001, // Register variable + S_CONSTANT_ST = 0x1002, // constant symbol + S_UDT_ST = 0x1003, // User defined type + S_COBOLUDT_ST = 0x1004, // special UDT for cobol that does not symbol pack + S_MANYREG_ST = 0x1005, // multiple register variable + S_BPREL32_ST = 0x1006, // BP-relative + S_LDATA32_ST = 0x1007, // Module-local symbol + S_GDATA32_ST = 0x1008, // Global data symbol + S_PUB32_ST = 0x1009, // a public symbol (CV internal reserved) + S_LPROC32_ST = 0x100a, // Local procedure start + S_GPROC32_ST = 0x100b, // Global procedure start + S_VFTABLE32 = 0x100c, // address of virtual function table + S_REGREL32_ST = 0x100d, // register relative address + S_LTHREAD32_ST = 0x100e, // local thread storage + S_GTHREAD32_ST = 0x100f, // global thread storage + + S_LPROCMIPS_ST = 0x1010, // Local procedure start + S_GPROCMIPS_ST = 0x1011, // Global procedure start + + S_FRAMEPROC = 0x1012, // extra frame and proc information + S_COMPILE2_ST = 0x1013, // extended compile flags and info + + // new symbols necessary for 16-bit enumerates of IA64 registers + // and IA64 specific symbols + + S_MANYREG2_ST = 0x1014, // multiple register variable + S_LPROCIA64_ST = 0x1015, // Local procedure start (IA64) + S_GPROCIA64_ST = 0x1016, // Global procedure start (IA64) + + // Local symbols for IL + S_LOCALSLOT_ST = 0x1017, // local IL sym with field for local slot index + S_PARAMSLOT_ST = 0x1018, // local IL sym with field for parameter slot index + + S_ANNOTATION = 0x1019, // Annotation string literals + + // symbols to support managed code debugging + S_GMANPROC_ST = 0x101a, // Global proc + S_LMANPROC_ST = 0x101b, // Local proc + S_RESERVED1 = 0x101c, // reserved + S_RESERVED2 = 0x101d, // reserved + S_RESERVED3 = 0x101e, // reserved + S_RESERVED4 = 0x101f, // reserved + S_LMANDATA_ST = 0x1020, + S_GMANDATA_ST = 0x1021, + S_MANFRAMEREL_ST= 0x1022, + S_MANREGISTER_ST= 0x1023, + S_MANSLOT_ST = 0x1024, + S_MANMANYREG_ST = 0x1025, + 
S_MANREGREL_ST = 0x1026, + S_MANMANYREG2_ST= 0x1027, + S_MANTYPREF = 0x1028, // Index for type referenced by name from metadata + S_UNAMESPACE_ST = 0x1029, // Using namespace + + // Symbols w/ SZ name fields. All name fields contain utf8 encoded strings. + S_ST_MAX = 0x1100, // starting point for SZ name symbols + + S_OBJNAME = 0x1101, // path to object file name + S_THUNK32 = 0x1102, // Thunk Start + S_BLOCK32 = 0x1103, // block start + S_WITH32 = 0x1104, // with start + S_LABEL32 = 0x1105, // code label + S_REGISTER = 0x1106, // Register variable + S_CONSTANT = 0x1107, // constant symbol + S_UDT = 0x1108, // User defined type + S_COBOLUDT = 0x1109, // special UDT for cobol that does not symbol pack + S_MANYREG = 0x110a, // multiple register variable + S_BPREL32 = 0x110b, // BP-relative + S_LDATA32 = 0x110c, // Module-local symbol + S_GDATA32 = 0x110d, // Global data symbol + S_PUB32 = 0x110e, // a public symbol (CV internal reserved) + S_LPROC32 = 0x110f, // Local procedure start + S_GPROC32 = 0x1110, // Global procedure start + S_REGREL32 = 0x1111, // register relative address + S_LTHREAD32 = 0x1112, // local thread storage + S_GTHREAD32 = 0x1113, // global thread storage + + S_LPROCMIPS = 0x1114, // Local procedure start + S_GPROCMIPS = 0x1115, // Global procedure start + S_COMPILE2 = 0x1116, // extended compile flags and info + S_MANYREG2 = 0x1117, // multiple register variable + S_LPROCIA64 = 0x1118, // Local procedure start (IA64) + S_GPROCIA64 = 0x1119, // Global procedure start (IA64) + S_LOCALSLOT = 0x111a, // local IL sym with field for local slot index + S_PARAMSLOT = 0x111b, // local IL sym with field for parameter slot index + + // symbols to support managed code debugging + S_LMANDATA = 0x111c, + S_GMANDATA = 0x111d, + S_MANFRAMEREL = 0x111e, + S_MANREGISTER = 0x111f, + S_MANSLOT = 0x1120, + S_MANMANYREG = 0x1121, + S_MANREGREL = 0x1122, + S_MANMANYREG2 = 0x1123, + S_UNAMESPACE = 0x1124, // Using namespace + + // ref symbols with name fields + 
S_PROCREF = 0x1125, // Reference to a procedure + S_DATAREF = 0x1126, // Reference to data + S_LPROCREF = 0x1127, // Local Reference to a procedure + S_ANNOTATIONREF = 0x1128, // Reference to an S_ANNOTATION symbol + S_TOKENREF = 0x1129, // Reference to one of the many MANPROCSYM's + + // continuation of managed symbols + S_GMANPROC = 0x112a, // Global proc + S_LMANPROC = 0x112b, // Local proc + + // short, light-weight thunks + S_TRAMPOLINE = 0x112c, // trampoline thunks + S_MANCONSTANT = 0x112d, // constants with metadata type info + + // native attributed local/parms + S_ATTR_FRAMEREL = 0x112e, // relative to virtual frame ptr + S_ATTR_REGISTER = 0x112f, // stored in a register + S_ATTR_REGREL = 0x1130, // relative to register (alternate frame ptr) + S_ATTR_MANYREG = 0x1131, // stored in >1 register + + // Separated code (from the compiler) support + S_SEPCODE = 0x1132, + + S_LOCAL_2005 = 0x1133, // defines a local symbol in optimized code + S_DEFRANGE_2005 = 0x1134, // defines a single range of addresses in which symbol can be evaluated + S_DEFRANGE2_2005 = 0x1135, // defines ranges of addresses in which symbol can be evaluated + + S_SECTION = 0x1136, // A COFF section in a PE executable + S_COFFGROUP = 0x1137, // A COFF group + S_EXPORT = 0x1138, // A export + + S_CALLSITEINFO = 0x1139, // Indirect call site information + S_FRAMECOOKIE = 0x113a, // Security cookie information + + S_DISCARDED = 0x113b, // Discarded by LINK /OPT:REF (experimental, see richards) + + S_COMPILE3 = 0x113c, // Replacement for S_COMPILE2 + S_ENVBLOCK = 0x113d, // Environment block split off from S_COMPILE2 + + S_LOCAL = 0x113e, // defines a local symbol in optimized code + S_DEFRANGE = 0x113f, // defines a single range of addresses in which symbol can be evaluated + S_DEFRANGE_SUBFIELD = 0x1140, // ranges for a subfield + + S_DEFRANGE_REGISTER = 0x1141, // ranges for en-registered symbol + S_DEFRANGE_FRAMEPOINTER_REL = 0x1142, // range for stack symbol. 
+ S_DEFRANGE_SUBFIELD_REGISTER = 0x1143, // ranges for en-registered field of symbol + S_DEFRANGE_FRAMEPOINTER_REL_FULL_SCOPE = 0x1144, // range for stack symbol span valid full scope of function body, gap might apply. + S_DEFRANGE_REGISTER_REL = 0x1145, // range for symbol address as register + offset. + + // S_PROC symbols that reference ID instead of type + S_LPROC32_ID = 0x1146, + S_GPROC32_ID = 0x1147, + S_LPROCMIPS_ID = 0x1148, + S_GPROCMIPS_ID = 0x1149, + S_LPROCIA64_ID = 0x114a, + S_GPROCIA64_ID = 0x114b, + + S_BUILDINFO = 0x114c, // build information. + S_INLINESITE = 0x114d, // inlined function callsite. + S_INLINESITE_END = 0x114e, + S_PROC_ID_END = 0x114f, + + S_DEFRANGE_HLSL = 0x1150, + S_GDATA_HLSL = 0x1151, + S_LDATA_HLSL = 0x1152, + + S_FILESTATIC = 0x1153, + + S_LOCAL_DPC_GROUPSHARED = 0x1154, // DPC groupshared variable + S_LPROC32_DPC = 0x1155, // DPC local procedure start + S_LPROC32_DPC_ID = 0x1156, + S_DEFRANGE_DPC_PTR_TAG = 0x1157, // DPC pointer tag definition range + S_DPC_SYM_TAG_MAP = 0x1158, // DPC pointer tag value to symbol record map + + S_ARMSWITCHTABLE = 0x1159, + S_CALLEES = 0x115a, + S_CALLERS = 0x115b, + S_POGODATA = 0x115c, + S_INLINESITE2 = 0x115d, // extended inline site information + + S_HEAPALLOCSITE = 0x115e, // heap allocation site + + S_MOD_TYPEREF = 0x115f, // only generated at link time + + S_REF_MINIPDB = 0x1160, // only generated at link time for mini PDB + S_PDBMAP = 0x1161, // only generated at link time for mini PDB + + S_GDATA_HLSL32 = 0x1162, + S_LDATA_HLSL32 = 0x1163, + + S_GDATA_HLSL32_EX = 0x1164, + S_LDATA_HLSL32_EX = 0x1165, + + S_RECTYPE_MAX = 0x1166, // one greater than last -> manually set for dissect.cstruct + S_RECTYPE_LAST = 0x1166 - 1, + S_RECTYPE_PAD = 0x1166 + 0x100 // Used *only* to verify symbol record types so that current PDB code can potentially read + // future PDBs (assuming no format change, etc). 
+}; + +///////////////////////////////////////////////////////////////////////// +// DBI specific definitions +// https://github.com/microsoft/microsoft-pdb/blob/master/PDB/dbi/dbi.h +// https://github.com/ungoogled-software/syzygy/blob/master/syzygy/pdb/pdb_data.h +///////////////////////////////////////////////////////////////////////// +struct DbiSectionContrib { + int16_t section; + int16_t pad1; + int32_t offset; + int32_t size; + uint32_t flags; + int16_t module; + int16_t pad2; + uint32_t data_crc; + uint32_t reloc_crc; +}; + +struct DbiModuleInfoBase { + uint32_t opened; + DbiSectionContrib section; + uint16_t flags; + int16_t stream; + uint32_t symbol_bytes; + uint32_t old_lines_bytes; + uint32_t lines_bytes; + int16_t num_files; + uint16_t padding; + uint32_t offsets; + uint32_t num_source; + uint32_t num_compiler; + char module_name[]; + char object_name[]; + // There are two trailing null-terminated 8-bit strings, the first being the + // module_name and the second being the object_name. Then this structure is + // padded with zeros to have a length that is a multiple of 4. +}; + +struct DbiSectionMapItem { + uint8_t flags; + uint8_t section_type; + // This field hasn't been deciphered but it is always 0x00000000 or 0xFFFFFFFF + // and modifying it doesn't seem to invalidate the PDB. + uint16_t unknown_data_1[2]; + uint16_t section_number; + // Same thing as for unknown_data_1. + uint16_t unknown_data_2[2]; + // Value added to the address offset when calculating the RVA. + uint32_t rva_offset; + uint32_t section_length; +}; + +enum header_signature { + hdrSignature = -1, +}; + +struct DbiHeader { + ULONG verSignature; + ULONG verHdr; + ULONG age; + + SN snGSSyms; + + union { + struct { + USHORT usVerPdbDllMin : 8; // minor version + USHORT usVerPdbDllMaj : 7; // major version + USHORT fNewVerFmt : 1; // flag telling us we have rbld stored elsewhere (high bit of original major version) # noqa: E501 + } vernew; // that built this pdb last. 
+ struct { + USHORT usVerPdbDllRBld: 4; + USHORT usVerPdbDllMin : 7; + USHORT usVerPdbDllMaj : 5; + } verold; + USHORT usVerAll; + }; + + SN snPSSyms; + USHORT usVerPdbDllBuild; // build version of the pdb dll that built this pdb last. + SN snSymRecs; + USHORT usVerPdbDllRBld; // rbld version of the pdb dll that built this pdb last. + CB cbGpModi; // size of rgmodi substream + CB cbSC; // size of Section Contribution substream + CB cbSecMap; + CB cbFileInfo; + + CB cbTSMap; // size of the Type Server Map substream + ULONG iMFC; // index of MFC type server + CB cbDbgHdr; // size of optional DbgHdr info appended to the end of the stream + CB cbECInfo; // number of bytes in EC substream, or 0 if EC no EC enabled Mods + struct _flags { + USHORT fIncLink:1; // true if linked incrmentally (really just if ilink thunks are present) + USHORT fStripped:1; // true if PDB::CopyTo stripped the private data out + USHORT fCTypes:1; // true if linked with /debug:ctypes + USHORT unused:13; // reserved, must be 0. + } flags; + USHORT wMachine; // machine type + ULONG rgulReserved[1]; // pad out to 64 bytes for future growth. +}; + +struct SymbolRecordHeader { + // Length of the symbol record in bytes, without this field. The length + // including this field is always a multiple of 4. + uint16_t length; + + // Type of the symbol record. If must be a value from Microsoft_Cci_Pdb::SYM. + SYM_ENUM_e type; +}; + + +enum CVPSF : uint32 { + CVPSF_CODE = 0x1, + CVPSF_FUNCTION = 0x2, + CVPSF_MANAGED = 0x4, + CVPSF_MSIL = 0x8, +}; + +enum Variant : uint16 { + uint8 = 0, + uint16 = 1, + uint32 = 2, + uint64 = 3, + int8 = 4, + int16 = 5, + int32 = 6, + int64 = 7, +}; + +// SYMBOL STRUCTURES + +struct GlobalSymbol { + CVPSF cvpsf_type; + uint32 offset; // The memory offset relative from the start of the section's memory. + uint16 section; // The index of the section in the PDB's section headers list, incremented by `1`. 
+ char name[]; +}; + +struct PublicSymbol { + CVPSF cvpsf_type; + uint32 offset; + uint16 section; + char name[]; +}; + +struct PublicSymbol_ST { + CVPSF cvpsf_type; + uint32 offset; + uint16 section; + uint8 name_length; + char name[name_length]; +}; + + +// ConstantSymbol +struct ConstantSymbolHeader { + uint32 type_index; + uint16 value; +}; + +struct ConstantSymbol { + uint32 type_index; + char value[]; + char name[]; +}; + + +struct RegisterSymbol { + uint32 type_index; + uint16 register; + char name[]; +}; + +struct GlobalDataSymbol { + uint32 type_index; + uint32 offset; + uint16 section; + char name[]; +}; + +struct ManagedDataSymbol { + uint32 type_index; + uint32 offset; + uint16 section; + char name[]; +} + +struct ProcedureReferenceSymbol{ + uint32 sum_name; + uint32 symbol_index; + uint16 module_index; // Index of the module containing the symbol + char name[]; +}; + +struct DataReferenceSymbol { + uint32 sum_name; + uint32 symbol_index; + uint32 module; + char name[]; +}; + +struct AnnotationReferenceSymbol { + uint32 sum_name; + uint32 symbol_index; + uint16 module; + char name[]; +}; + +typedef enum TrampolineType : uint16 { + Incremental = 0x0, + BranchIsland = 0x1, + Unknown = 0x2, +}; + +struct TrampolineSymbol { + TrampolineType trampoline_type; + uint16 size; + uint32 thunk_offset; // The memory offset relative from the start of the section's memory. + uint16 thunk_section; // The index of the section in the PDB's section headers list, incremented by `1`. + uint32 target_offset; // The target memory offset relative from the start of the section's memory. + uint16 target_section; // The target index of the section in the PDB's section headers list, incremented by `1`. 
# Types that were gathered from creating some PDB's using Visual Studio.
# Maps a primitive (compiler built-in) type index to the cstruct type used to read a value of that type.
COMPILER_TYPES = {
    0x8: c_pdb.uint32,  # HRESULT
    0x10: c_pdb.char,  # __int8 / signed char
    0x11: c_pdb.short,
    0x12: c_pdb.int32,  # LONG
    0x13: c_pdb.int64,  # LONGLONG
    0x14: c_pdb.int128,
    0x20: c_pdb.uchar,  # byte
    0x21: c_pdb.WORD,  # WORD
    0x22: c_pdb.uint32,  # ULONG
    0x23: c_pdb.uint64,  # ULONGLONG / QWORD
    0x24: c_pdb.uint128,
    0x30: c_pdb.uint8,  # 8-bit boolean (T_BOOL08); was read as 4 bytes
    0x40: c_pdb.float,
    0x41: c_pdb.double,
    0x42: c_pdb.char[10],  # 80-bit real, kept as 10 raw bytes
    0x45: c_pdb.float,  # float32pp
    0x46: c_pdb.float16,
    0x68: c_pdb.int8,
    0x69: c_pdb.uint8,
    0x70: c_pdb.char,  # CHAR
    0x71: c_pdb.wchar,
    0x72: c_pdb.int16,
    0x73: c_pdb.uint16,
    0x74: c_pdb.int,  # INT
    0x75: c_pdb.uint32,  # DWORD32 / unsigned int
    0x76: c_pdb.int64,  # LONGLONG
    0x77: c_pdb.uint64,  # ULONGLONG
    0x78: c_pdb.int128,
    0x79: c_pdb.uint128,
    # char16_t / char32_t (T_CHAR16 / T_CHAR32). The sizes now agree with the pointee sizes that
    # POINTER_TYPES uses for 0x67A (2 bytes) and 0x67B (4 bytes); both were read as 8 bytes before.
    0x7A: c_pdb.uint16,
    0x7B: c_pdb.uint32,
    # NOTE(review): the two indices below are outside the primitive type range; their meaning is
    # unknown and the 8-byte size is kept as originally observed.
    0x48CA: c_pdb.uint64,
    0x1B1511: c_pdb.uint64,
}
def parse_userdefined_symbol(symbol_data: BinaryIO) -> cstruct:
    """Build a `UserDefinedSymbol` from the raw bytes of a symbol record.

    The stream is rewound first so that the structure is always read from the very start of
    `symbol_data`, regardless of where the caller left the read position.

    Args:
        symbol_data: The raw data of the symbol to be parsed.

    Returns:
        The symbol as a `cstruct`.
    """

    start_of_record = 0
    symbol_data.seek(start_of_record)

    user_defined_symbol = c_pdb.UserDefinedSymbol(symbol_data)
    return user_defined_symbol
class DBI:
    """Class for parsing the DBI stream of a PDB file.

    The DBI header is read on construction; `parse_info` then walks the module information
    substream, the section map substream and the symbol record stream referenced by the header.

    Attributes:
        SYMBOL_STRUCTS: A dictionary mapping a symbol record type to the `cstruct` definition or function used to
                        parse it. A value of `None` marks a known record type that has no parser yet; such
                        records are skipped by `parse_symbols`.

    Args:
        streams: The list with `PageStream` entries for this PDB.
    """

    SYMBOL_STRUCTS = {
        # PublicSymbol
        c_pdb.SYM_ENUM_e.S_PUB32: c_pdb.PublicSymbol,
        c_pdb.SYM_ENUM_e.S_PUB32_ST: c_pdb.PublicSymbol_ST,
        # ConstantSymbol
        c_pdb.SYM_ENUM_e.S_CONSTANT: parse_constant_symbol,
        c_pdb.SYM_ENUM_e.S_CONSTANT_ST: parse_constant_symbol,
        # RegisterSymbol
        c_pdb.SYM_ENUM_e.S_REGISTER: c_pdb.RegisterSymbol,
        c_pdb.SYM_ENUM_e.S_REGISTER_ST: c_pdb.RegisterSymbol,
        c_pdb.SYM_ENUM_e.S_MANYREG: None,
        c_pdb.SYM_ENUM_e.S_MANYREG_ST: None,
        c_pdb.SYM_ENUM_e.S_MANYREG2: None,
        c_pdb.SYM_ENUM_e.S_MANYREG2_ST: None,
        # GlobalDataSymbol
        c_pdb.SYM_ENUM_e.S_GDATA32: c_pdb.GlobalDataSymbol,
        c_pdb.SYM_ENUM_e.S_GDATA32_ST: c_pdb.GlobalDataSymbol,
        c_pdb.SYM_ENUM_e.S_GMANDATA: c_pdb.GlobalDataSymbol,
        c_pdb.SYM_ENUM_e.S_GMANDATA_ST: c_pdb.GlobalDataSymbol,
        # ManagedDataSymbol
        c_pdb.SYM_ENUM_e.S_LDATA32: c_pdb.ManagedDataSymbol,
        c_pdb.SYM_ENUM_e.S_LDATA32_ST: c_pdb.ManagedDataSymbol,
        c_pdb.SYM_ENUM_e.S_LMANDATA: c_pdb.ManagedDataSymbol,
        c_pdb.SYM_ENUM_e.S_LMANDATA_ST: c_pdb.ManagedDataSymbol,
        # ProcedureReferenceSymbol
        c_pdb.SYM_ENUM_e.S_PROCREF: c_pdb.ProcedureReferenceSymbol,
        c_pdb.SYM_ENUM_e.S_PROCREF_ST: c_pdb.ProcedureReferenceSymbol,
        c_pdb.SYM_ENUM_e.S_LPROCREF: c_pdb.ProcedureReferenceSymbol,
        c_pdb.SYM_ENUM_e.S_LPROCREF_ST: c_pdb.ProcedureReferenceSymbol,
        # DataReferenceSymbol
        c_pdb.SYM_ENUM_e.S_DATAREF: c_pdb.DataReferenceSymbol,
        c_pdb.SYM_ENUM_e.S_DATAREF_ST: c_pdb.DataReferenceSymbol,
        # AnnotationReferenceSymbol
        c_pdb.SYM_ENUM_e.S_ANNOTATIONREF: c_pdb.AnnotationReferenceSymbol,
        # TrampolineSymbol
        c_pdb.SYM_ENUM_e.S_TRAMPOLINE: c_pdb.TrampolineSymbol,
        # UserDefinedSymbol
        c_pdb.SYM_ENUM_e.S_UDT: parse_userdefined_symbol,
        c_pdb.SYM_ENUM_e.S_UDT_ST: parse_userdefined_symbol,
        # ThreadStorageSymbol
        c_pdb.SYM_ENUM_e.S_GTHREAD32: c_pdb.ThreadStorageSymbol,
        c_pdb.SYM_ENUM_e.S_GTHREAD32_ST: c_pdb.ThreadStorageSymbol,
        c_pdb.SYM_ENUM_e.S_LTHREAD32: c_pdb.ThreadStorageSymbol,
        c_pdb.SYM_ENUM_e.S_LTHREAD32_ST: c_pdb.ThreadStorageSymbol,
        # ProcedureSymbol
        c_pdb.SYM_ENUM_e.S_GPROC32: c_pdb.ProcedureSymbol,
        c_pdb.SYM_ENUM_e.S_GPROC32_ST: c_pdb.ProcedureSymbol,
        c_pdb.SYM_ENUM_e.S_LPROC32: c_pdb.ProcedureSymbol,
        c_pdb.SYM_ENUM_e.S_LPROC32_ST: c_pdb.ProcedureSymbol,
        c_pdb.SYM_ENUM_e.S_LPROC32_DPC: c_pdb.ProcedureSymbol,
        c_pdb.SYM_ENUM_e.S_GPROC32_ID: c_pdb.ProcedureSymbol,
        c_pdb.SYM_ENUM_e.S_LPROC32_ID: c_pdb.ProcedureSymbol,
        c_pdb.SYM_ENUM_e.S_LPROC32_DPC_ID: c_pdb.ProcedureSymbol,
        # TokenReferenceSymbol
        c_pdb.SYM_ENUM_e.S_TOKENREF: c_pdb.TokenReferenceSymbol,
    }

    def __init__(self, streams: list[PageStream]):
        # Stream 3 holds the DBI data; its header tells us which stream holds the symbol records.
        self.stream = streams[3]
        self.header = c_pdb.DbiHeader(self.stream)

        self.symbol_stream = streams[self.header.snSymRecs]
        self.symbols = {}
        self.module_info_list = []
        self.section_map_items = []

    def parse_info(self):
        """Parse the symbol information that is present within the PDB file.

        Fills `module_info_list`, `section_map_items` and `symbols`.
        """

        # The module information substream directly follows the DBI header.
        module_info_offset = len(self.header)
        self._parse_module_info(offset=module_info_offset, dbi_stream=self.stream)

        # The section map follows the module information and section contribution substreams.
        section_map_offset = len(self.header) + self.header.cbGpModi + self.header.cbSC
        self._parse_section_maps(offset=section_map_offset, dbi_stream=self.stream)

        self._parse_symbols()

    def _parse_module_info(self, offset: int, dbi_stream: BinaryIO):
        """Function to parse the module information, this structure contains the module names and objects.

        Entries whose `stream` field equals -1 are not stored.

        Args:
            offset: The offset from which to start reading the module information structures.
            dbi_stream: A file-like object of the DBI stream to be parsed.
        """

        dbi_stream.seek(offset)
        module_info_end = offset + self.header.cbGpModi

        offset = dbi_stream.tell()
        while offset < module_info_end:
            # Every entry starts on a 4 byte boundary, skip the padding after the previous entry
            if offset % 4 != 0:
                dbi_stream.seek(offset + (4 - (offset % 4)))

            module_info_base = c_pdb.DbiModuleInfoBase(dbi_stream)
            if module_info_base.stream != -1:
                self.module_info_list.append(module_info_base)

            offset = dbi_stream.tell()

    def _parse_section_maps(self, offset: int, dbi_stream: BinaryIO):
        """Function to parse the section maps within a PDB file.

        Args:
            offset: The offset from which to start reading the section maps structures.
            dbi_stream: A file-like object of the DBI stream to be parsed.
        """

        # NOTE(review): other descriptions of the PDB format show a 4 byte count header in front of the
        # section map entries. It is not skipped here - confirm whether the first item is read correctly.
        dbi_stream.seek(offset)
        section_map_end = offset + self.header.cbSecMap

        while offset < section_map_end:
            dbi_section_map_item = c_pdb.DbiSectionMapItem(dbi_stream)
            self.section_map_items.append(dbi_section_map_item)

            offset = dbi_stream.tell()

    def _parse_symbols(self) -> None:
        """Parse the symbols and store them in the `symbols` dictionary, keyed by their decoded name.

        Symbols without a name are not stored, a later symbol replaces an earlier one with the same name.
        """

        for symbol in self.parse_symbols():
            if symbol and symbol.name:
                self.symbols[symbol.name.decode()] = symbol

    def parse_symbols(self) -> Generator[cstruct, None, None]:
        """Function to parse the symbols defined in the PDB.

        Records of a type that has no parser in `SYMBOL_STRUCTS` (either not listed, or listed as `None`)
        are skipped instead of aborting the iteration.

        Yields:
            The symbols that were found as `cstruct` objects.
        """

        offset = self.symbol_stream.tell()
        with retain_file_offset(fobj=self.symbol_stream, offset=offset):
            while True:
                try:
                    # Read the symbol record header to establish the right struct to use
                    symbol_record = c_pdb.SymbolRecordHeader(self.symbol_stream)
                except EOFError:
                    break

                if symbol_record.length < 2:
                    # The length covers at least the 2 byte type field, anything smaller means we are
                    # reading padding or corrupt data and cannot locate the next record.
                    break

                # Read the symbol data, compensate for the length field in the header. This is done before
                # the parser lookup so the stream is positioned on the next record even if we skip this one.
                symbol_data = BytesIO(self.symbol_stream.read(symbol_record.length - 2))

                parser = self.SYMBOL_STRUCTS.get(symbol_record.type)
                if parser is None:
                    # Unsupported record type
                    continue

                yield parser(symbol_data)
+ + PDB layout (from: https://github.com/microsoft/microsoft-pdb) + + STREAM 1 = Pdb Header - Version information, and information to connect this PDB to the EXE + STREAM 2 = Tpi (Type Manager) - All the types used in the executable. + STREAM 3 = Dbi (Debug Manager) - Holds section contributions, and list of 'Mods' + STREAM 4 = NameMap - Holds a hashed string table + STREAM 4-(n+4) = n Mod's(Module Information) - Each Mod stream holds symbols and line numbers for one compiland + STREAM n+4 = Global Symbol Hash - An index that allows searching in global symbols by name + STREAM n+5 = Public Symbol Hash - An index that allows searching in public symbols by addresses + STREAM n+6 = Symbol Records - Actual symbol records of global and public symbols + STREAM n+7 = Type Hash - Hash used by the TPI stream. + + Args: + fh: A file handle to a PDB file. + pages: A list with the amount of pages found within the PDB file. + size: Size of the root stream within the PDB file. + page_size: Size of the page. + """ + + def __init__(self, fh: BinaryIO, pages: list[int], size: int, page_size: int) -> None: + super().__init__(size=size) + self.fh = fh + self.pages = pages + self.size = size + self.page_size = page_size + + def _read(self, offset: int, length: int) -> bytes: + """Read functionality implementation for page streams. + + Args: + size: Amount of bytes to read. + + Returns: + The amount of `bytes` that needed to be read by size. + """ + + page_num_start, offset_in_page = divmod(offset, self.page_size) + + page_num_end, end_offset = divmod(offset + length, self.page_size) + page_data = self._read_pages(self.pages[page_num_start : page_num_end + 1]) + + return page_data[offset_in_page:][:length] + + def _get_page(self, offset: int) -> Tuple[int, int]: + """Function to retrieve the start/end of a page and the start/end of the offset. + + Args: + offset: The offset to use to retrieve the start/end of the page stream. 
# Total amount of bytes (leaf value included) occupied by a fixed size numeric leaf.
# Built once instead of on every call of `skip_numeric`.
_NUMERIC_LEAF_SKIP = {
    c_pdb.LEAF_ENUM_e.LF_CHAR.value: DEFAULT_SKIP + c_pdb.BYTE.size,
    c_pdb.LEAF_ENUM_e.LF_SHORT.value: DEFAULT_SKIP + c_pdb.SHORT.size,
    c_pdb.LEAF_ENUM_e.LF_USHORT.value: DEFAULT_SKIP + c_pdb.WORD.size,
    c_pdb.LEAF_ENUM_e.LF_LONG.value: DEFAULT_SKIP + 4,
    c_pdb.LEAF_ENUM_e.LF_ULONG.value: DEFAULT_SKIP + 4,
    c_pdb.LEAF_ENUM_e.LF_REAL32.value: DEFAULT_SKIP + 4,
    c_pdb.LEAF_ENUM_e.LF_REAL64.value: DEFAULT_SKIP + 8,
    c_pdb.LEAF_ENUM_e.LF_COMPLEX32.value: DEFAULT_SKIP + 8,
    c_pdb.LEAF_ENUM_e.LF_DATE.value: DEFAULT_SKIP + 8,
    c_pdb.LEAF_ENUM_e.LF_REAL80.value: DEFAULT_SKIP + 10,
    c_pdb.LEAF_ENUM_e.LF_QUADWORD.value: DEFAULT_SKIP + c_pdb.QWORD.size,
    c_pdb.LEAF_ENUM_e.LF_UQUADWORD.value: DEFAULT_SKIP + c_pdb.QWORD.size,
    c_pdb.LEAF_ENUM_e.LF_REAL48.value: DEFAULT_SKIP + 6,
    c_pdb.LEAF_ENUM_e.LF_COMPLEX64.value: DEFAULT_SKIP + 16,
    c_pdb.LEAF_ENUM_e.LF_OCTWORD.value: DEFAULT_SKIP + 16,
    c_pdb.LEAF_ENUM_e.LF_UOCTWORD.value: DEFAULT_SKIP + 16,
    c_pdb.LEAF_ENUM_e.LF_REAL128.value: DEFAULT_SKIP + 16,
    c_pdb.LEAF_ENUM_e.LF_COMPLEX80.value: DEFAULT_SKIP + 20,
    c_pdb.LEAF_ENUM_e.LF_COMPLEX128.value: DEFAULT_SKIP + 32,
    # https://learn.microsoft.com/en-us/office/vba/language/reference/user-interface-help/data-type-summary
    c_pdb.LEAF_ENUM_e.LF_DECIMAL.value: DEFAULT_SKIP + 14,
    c_pdb.LEAF_ENUM_e.LF_UTF8STRING.value: DEFAULT_SKIP,
}


def skip_numeric(type_data: BinaryIO) -> int:
    """Implementation of the skipNumeric function in the official Microsoft PDB parsing code.

    This function is used to skip a certain amount of bytes for the variable length names in some
    of the structures and member fields of different types.

    Two bytes (the leaf value) are consumed from `type_data`; the returned amount is relative to the
    position the stream had before this call.

    Args:
        type_data: The TPI stream as a file-like object.

    Returns:
        The amount of bytes to skip.
    """

    index = c_pdb.uint16(type_data)
    if index < c_pdb.LEAF_ENUM_e.LF_NUMERIC.value:
        # The value is stored in the leaf itself
        return DEFAULT_SKIP

    if index == c_pdb.LEAF_ENUM_e.LF_VARSTRING.value:
        # A variable length string is prefixed by its 16-bit length: skip the leaf, the length field and
        # the string itself. The length has to be read from the stream, it cannot be derived from the
        # leaf value.
        string_length = c_pdb.uint16(type_data)
        # Leave the stream exactly two bytes past its starting position, as for every other leaf.
        type_data.seek(-2, os.SEEK_CUR)
        return DEFAULT_SKIP + 2 + string_length

    # Unknown numeric leaves fall back to skipping just the leaf value
    return _NUMERIC_LEAF_SKIP.get(index, DEFAULT_SKIP)
def _parse_type(self, index: int, tpi: cstruct) -> TypeObject:
    """Dispatch a single TPI record to the parser that belongs to its leaf type.

    Args:
        index: The type index of the record, handed on to the leaf parser.
        tpi: A `TpiType` structure definition specified in c_pdb.py.

    Returns:
        A `TypeObject` indicating the type that is associated with the specific index, or `None` when the
        record is 2 bytes or shorter.

    Raises:
        `UnknownTPIType` if an unknown or unsupported yet TPI type is encountered. A `KeyError` raised from
        within a leaf parser is reported as `UnknownTPIType` as well.
    """

    # This seems to happen sporadically but doesn't break the parsing?
    if tpi.length <= 2:
        return None

    type_data = BytesIO(tpi.type_data)
    tpi_type = c_pdb.LEAF_ENUM_e(c_pdb.uint16(type_data))

    leaf = c_pdb.LEAF_ENUM_e
    handlers = {
        leaf.LF_MODIFIER: self._parse_lf_modifier,
        leaf.LF_PROCEDURE: self._parse_lf_procedure,
        leaf.LF_POINTER: self._parse_lf_pointer,
        leaf.LF_ARGLIST: self._parse_lf_arglist,
        leaf.LF_FIELDLIST: self._parse_lf_fieldlist,
        leaf.LF_BITFIELD: self._parse_lf_bitfield,
        leaf.LF_ARRAY: self._parse_lf_array,
        leaf.LF_STRUCTURE: self._parse_lf_structure,
        leaf.LF_STRUCTURE_16t: self._parse_lf_structure,
        leaf.LF_CLASS: self._parse_lf_structure,
        leaf.LF_CLASS_16t: self._parse_lf_structure,
        leaf.LF_UNION: self._parse_lf_union,
        leaf.LF_ENUM: self._parse_lf_enum,
        leaf.LF_MFUNCTION: self._parse_lf_mfunction,
        leaf.LF_MFUNCTION_16t: self._parse_lf_mfunction,
        leaf.LF_METHODLIST: self._parse_lf_methodlist,
        leaf.LF_METHODLIST_16t: self._parse_lf_methodlist,
        leaf.LF_VTSHAPE: self._parse_lf_vtshape,
        leaf.LF_VFTABLE: self._parse_lf_vftable,
    }

    try:
        # Both the lookup and the handler call are guarded on purpose, see `Raises` above
        handler = handlers[tpi_type]
        return handler(tpi_type=tpi_type, index=index, length=tpi.length, type_data=type_data)
    except KeyError:
        raise UnknownTPIType(f"Unsupported TPI: {tpi_type}")
def _parse_lf_fieldlist(self, *args, **kwargs) -> Iterable:
    """Parser for the LF_FIELDLIST leaf type.

    Walks the sub-records of a field list. Every supported sub-record is parsed so the stream advances
    past it, but only LF_MEMBER / LF_MEMBER_ST results are collected, as those are what the struct and
    union builders consume.

    Args:
        length: The length of the TPI data.
        type_data: The `bytes` out of which the type is build.

    Returns:
        An `Iterable` list containing the different types associated with this fieldlist.
    """

    type_data = kwargs["type_data"]
    length = kwargs["length"]

    leaf = c_pdb.LEAF_ENUM_e
    handlers = {
        leaf.LF_MEMBER: self._parse_lf_member,
        leaf.LF_MEMBER_ST: self._parse_lf_member,
        leaf.LF_STMEMBER: self._parse_lf_member,
        leaf.LF_STMEMBER_ST: self._parse_lf_member,
        leaf.LF_ENUMERATE: self._parse_lf_enumerate,
        leaf.LF_METHOD: self._parse_lf_method,
        leaf.LF_METHOD_ST: self._parse_lf_method,
        leaf.LF_ONEMETHOD: self._parse_lf_method,
        leaf.LF_ONEMETHOD_ST: self._parse_lf_method,
        leaf.LF_VFUNCTAB: self._parse_lf_vfunctab,
        leaf.LF_BCLASS: self._parse_lf_bclass,
        leaf.LF_BINTERFACE: self._parse_lf_bclass,
        leaf.LF_NESTTYPE: self._parse_lf_nesttype,
        leaf.LF_NESTTYPE_ST: self._parse_lf_nesttype,
        leaf.LF_NESTTYPEEX: self._parse_lf_nesttype,
        leaf.LF_NESTTYPEEX_ST: self._parse_lf_nesttype,
        leaf.LF_VBCLASS: self._parse_lf_vbclass,
        leaf.LF_IVBCLASS: self._parse_lf_vbclass,
        leaf.LF_INDEX: self._parse_lf_index,
        leaf.LF_INDEX_16t: self._parse_lf_index,
    }
    collected_leaves = (leaf.LF_MEMBER, leaf.LF_MEMBER_ST)

    members = []

    while type_data.tell() < length:
        leaf_type = c_pdb.LEAF_ENUM_e(type_data)

        try:
            member = handlers[leaf_type](leaf_type=leaf_type, type_data=type_data)
        except KeyError:
            # Leaf not supported yet
            pass
        else:
            # Only LF_MEMBER and LF_MEMBER_ST entries end up in the result
            if leaf_type in collected_leaves:
                members.append(member)

        # Sub-records are 4 byte aligned; the + 2 shifts the position within `type_data` before the
        # alignment is calculated
        misalignment = (type_data.tell() + 2) % 4
        if misalignment:
            type_data.seek(4 - misalignment, os.SEEK_CUR)

    return members
def _parse_lf_array(self, *args, **kwargs) -> CType:
    """Parser for the LF_ARRAY leaf type.

    Args:
        type_data: The `bytes` out of which the type is build.

    Returns:
        The array type, e.g. uint32[4]. When the element type is an Enum, Union or Structure, the type
        registered under the element's name in the PDB cstruct is returned as is, without a count.
    """

    array = c_pdb.LF_ARRAY(kwargs["type_data"])
    element_type = self._resolve_type(array.elemtype)

    # Enum, Union and Structure elements are looked up by name, no count is applied to them
    if isinstance(element_type, (Enum, Union, Structure)):
        return getattr(self.pdb_cstruct, element_type.name)

    # The record stores the total size in bytes, derive the element count from it
    element_size = len(element_type)
    count = int(array.size / element_size) if element_size != 0 else 0

    return element_type[count]
+ + Returns: + An instance of `cstruct.Structure`. + """ + + if kwargs["tpi_type"] in [c_pdb.LEAF_ENUM_e.LF_STRUCTURE, c_pdb.LEAF_ENUM_e.LF_CLASS]: + lf_struct = c_pdb.LF_STRUCTURE(kwargs["type_data"]) + else: + lf_struct = c_pdb.LF_STRUCTURE_16t(kwargs["type_data"]) + struct_name = get_name(data=kwargs["type_data"]) + + # Some structs might not be named in a symbol file, these seem to have some kind of + # naming convention when the symbol is coming from Microsoft at least. + if struct_name in ["__unnamed", "", ""]: + struct_name = f"unnamed_{kwargs['index']:04x}" + + if hasattr(self.pdb_cstruct, struct_name): + # Retrieve the struct if we encountered this as a part of a forward declaration + cstruct_struct = getattr(self.pdb_cstruct, struct_name) + else: + # Instantiate a new empty `Structure` if this is a new declaration + cstruct_struct = Structure(self.pdb_cstruct, struct_name, []) + + if lf_struct.field != 0: + # forward declaration? + if not self.types[lf_struct.field]: + return cstruct_struct + + for member in self.types[lf_struct.field]: + try: + field_type = self._resolve_type(member_type=member.index) + except UnknownTPIType: + """An UnknownType exception can occur when we're parsing a PDB file that wasn't originated by + Microsoft. These user compiled binaries may contain types that are not specified in the Microsoft + PDB format. 
Set the field type to the respective uint based on the index number when we encounter + such a type.""" + # The 0x1000 range is reserved for 32-bit values + print(f"UnknownTPIType encountered: 0x{member.index:02x}") + if member.index & 0x1000: + field_type = c_pdb.uint32 + else: + field_type = c_pdb.uint64 + + if isinstance(field_type, tuple): + cstruct_struct.add_field( + name=member.name, type_=field_type[0], bits=field_type[1], offset=member.offset + ) + else: + cstruct_struct.add_field(name=member.name, type_=field_type, offset=member.offset) + + self.pdb_cstruct.addtype(name=struct_name, type_=cstruct_struct, replace=True) + + return cstruct_struct + + def _parse_lf_union(self, *args, **kwargs) -> Union: + """Parser for the LF_UNION leaf type. + + Args: + type_data: The `bytes` out of which the type is build. + + Returns: + An instance of `cstruct.Union`. + """ + + lf_union = c_pdb.LF_UNION(kwargs["type_data"]) + if lf_union.property.fwdref and lf_union.field: + field_type = self._resolve_type(member_type=lf_union.field) + return field_type + + union_name = lf_union.name.decode() + + if union_name in ["__unnamed", "", ""]: + union_name = f"unnamed_{kwargs['index']:04x}" + + if hasattr(self.pdb_cstruct, union_name): + cstruct_union = getattr(self.pdb_cstruct, union_name) + else: + cstruct_union = Union(self.pdb_cstruct, union_name, []) + + if lf_union.field: + for member in self.types[lf_union.field]: + field_type = self._resolve_type(member_type=member.index) + + # Check if this member is a bitfield + if isinstance(field_type, tuple): + cstruct_union.add_field( + name=member.name, type_=field_type[0], bits=field_type[1], offset=member.offset + ) + else: + cstruct_union.add_field(name=member.name, type_=field_type, offset=member.offset) + + self.pdb_cstruct.addtype(union_name, cstruct_union, replace=True) + + return cstruct_union + + def _parse_lf_enum(self, *args, **kwargs) -> Enum: + """Parser for the LF_ENUM leaf type. 
+ + Args: + type_data: The `bytes` out of which the type is build. + + Returns: + An instance of `cstruct.Enum`. + """ + + lf_enum = c_pdb.LF_ENUM(kwargs["type_data"]) + enum_name = lf_enum.name.decode() + + # Likely forward declaration, not supported by cstruct yet, return a pointer + if not lf_enum.utype: + import ipdb + + ipdb.set_trace() + return self.pdb_cstruct.ptr + + field_type = self._resolve_type(member_type=lf_enum.utype) + + if hasattr(self.pdb_cstruct, enum_name): + cstruct_enum = getattr(self.pdb_cstruct, enum_name) + else: + cstruct_enum = Enum(self.pdb_cstruct, enum_name, field_type, {}) + + if isinstance(lf_enum.field, list): + enum_fields = {} + for member in self.types[lf_enum.field]: + enum_fields[member.name] = member.value + + cstruct_enum = Enum(self.pdb_cstruct, enum_name, field_type, enum_fields) + + self.pdb_cstruct.addtype(enum_name, cstruct_enum, replace=True) + + return cstruct_enum + + def _parse_lf_mfunction(self, *args, **kwargs) -> Structure: + """Parser for the LF_MFUNCTION and LF_MFUNCTION_16t leaf types. + + Returns: + A `Structure` object for the leaf type. + """ + + if kwargs["tpi_type"] == c_pdb.LEAF_ENUM_e.LF_MFUNCTION: + lf_function = c_pdb.LF_MFUNCTION(kwargs["type_data"]) + else: + lf_function = c_pdb.LF_MFUNCTION_16t(kwargs["type_data"]) + + return lf_function + + def _parse_lf_methodlist(self, *args, **kwargs) -> Structure: + """Parser for the LF_METHOD and LF_METHOD_16t leaf types. + + Returns: + A `Structure` object for the leaf type. + """ + + if kwargs["tpi_type"] == c_pdb.LEAF_ENUM_e.LF_METHODLIST: + # TODO + lf_methodlist = c_pdb.LF_METHOD(kwargs["type_data"]) + else: + lf_methodlist = c_pdb.LF_METHOD_16t(kwargs["type_data"]) + return lf_methodlist + + def _parse_lf_vtshape(self, *args, **kwargs) -> CType: + """Parser for the LF_VTABLE leaf type. + + Returns: + A `CType` object for the leaf type. 
+ """ + + # TODO + try: + lf_vtshape = c_pdb.LF_VTABLE(kwargs["type_data"]) + except EOFError: + # Unsure how to parse this correctly + return c_pdb.LF_VTABLE + + return lf_vtshape + + def _parse_lf_vftable(self, *args, **kwargs) -> CType: + """Parser for the LF_VFTABLE leaf type. + + Returns: + A `CType` object for the leaf type. + """ + + # TODO + lf_vftable = c_pdb.LF_VFTABLE(kwargs["type_data"]) + + return self._resolve_type(member_type=lf_vftable.type) + + def _parse_lf_enumerate(self, *args, **kwargs) -> Structure: + """Parser for the LF_ENUMERATE leaf type. + + Returns: + A `Structure` object for the leaf type. + """ + + type_data = kwargs["type_data"] + offset = type_data.tell() + lf_enumerate = c_pdb.LF_ENUMERATE(type_data) + # Need to resolve the name seperately as there's a variable length + type_data.seek(offset + c_pdb.CV_fldattr_t.size) + lf_enumerate.name = get_name(data=type_data) + + return lf_enumerate + + def _parse_lf_member(self, *args, **kwargs) -> Structure: + """Parser for the LF_MEMBER and LF_MEMBER_ST leaf types. + + Returns: + A `Structure` object for the leaf type. + """ + + type_data = kwargs["type_data"] + offset = type_data.tell() + if kwargs["leaf_type"] in [c_pdb.LEAF_ENUM_e.LF_MEMBER, c_pdb.LEAF_ENUM_e.LF_MEMBER_ST]: + lf_member = c_pdb.LF_MEMBER(type_data) + # Need to resolve the name seperately as there's a variable length + type_data.seek(offset + c_pdb.CV_fldattr_t.size + c_pdb.CV_typ_t.size) + lf_member.name = get_name(data=type_data) + return lf_member + else: + c_pdb.LF_STMEMBER(type_data) + + def _parse_lf_method(self, *args, **kwargs) -> Structure: + """Parser for the LF_METHOD and LF_ONEMETHOD leaf types. + + Returns: + A `Structure` object for the leaf type. 
+ """ + + type_data = kwargs["type_data"] + if kwargs["leaf_type"] == c_pdb.LEAF_ENUM_e.LF_METHOD: + return c_pdb.LF_METHOD(type_data) + + header = c_pdb.LF_ONEMETHOD_HEADER(type_data) + nember = c_pdb.LF_ONEMETHOD() + + nember.attr = header.attr + nember.index = header.index + nember.offset = type_data.tell() + if header.attr.mprop in [c_pdb.CV_methodprop_e.CV_MTintro, c_pdb.CV_methodprop_e.CV_MTpureintro]: + nember.offset = c_pdb.uint32(type_data) + + nember.name = c_pdb.char[None](type_data) + + def _parse_lf_vfunctab(self, *args, **kwargs) -> Structure: + """Parser for the LF_VFUNCTAB leaf type. + + Returns: + A `Structure` object for the leaf type. + """ + return c_pdb.LF_VFUNCTAB(kwargs["type_data"]) + + def _parse_lf_bclass(self, *args, **kwargs) -> Structure: + """Parser for the LF_BCLASS leaf type. + + Returns: + A `Structure` object for the leaf type. + """ + return c_pdb.LF_BCLASS(kwargs["type_data"]) + + def _parse_lf_nesttype(self, *args, **kwargs) -> Structure: + """Parser for the LF_NESTTYPE leaf type. + + Returns: + A `Structure` object for the leaf type. + """ + return c_pdb.LF_NESTTYPE(kwargs["type_data"]) + + def _parse_lf_vbclass(self, *args, **kwargs) -> Structure: + """Parser for the LF_VBCLASS leaf type. + + Returns: + A `Structure` object for the leaf type. + """ + c_pdb.LF_VBCLASS(kwargs["type_data"]) + + def _parse_lf_index(self, *args, **kwargs): + """Parser for the LF_INDEX leaf type.""" + kwargs["type_data"].read(2) + + def _resolve_type(self, member_type: int) -> CType: + """Function to resolve the type based on the index that is specified for the member type. + + Args: + member_type: An integer that is used to denote the type used for the specific member. + + Returns: + A `Ctype` instance that is associated with that specific member. + + Raises: + UnknownType exception if the TPI type is not known. 
# dissect/executable/pdb/helpers/utils.py
import io
from contextlib import contextmanager
from typing import BinaryIO, Generator, Iterator, Optional


@contextmanager
def retain_file_offset(
    fobj: BinaryIO, offset: Optional[int] = None, whence: int = io.SEEK_SET
) -> Generator[BinaryIO, None, None]:
    """Temporarily move within a file-like object and restore the position afterwards.

    Args:
        fobj: The file-like object we're working with.
        offset: The offset to seek to before yielding, if None the current position is kept.
        whence: How `offset` is interpreted, see `io.SEEK_SET` and friends.

    Yields:
        The file-like object.
    """

    # Taken outside of the `try` so a failing `tell()` is not masked by an unbound `pos` in `finally`
    pos = fobj.tell()
    try:
        if offset is not None:
            fobj.seek(offset, whence)
        yield fobj
    finally:
        fobj.seek(pos)


def iter_find_needle(
    fobj: BinaryIO, needle: bytes, start_offset: Optional[int] = None, max_offset: int = 0
) -> Iterator[int]:
    """Return an iterator yielding `offset` for found `needle` bytes in file `fobj`.

    Side effects: file handle position due to seeking.

    Args:
        fobj: file like object
        needle: needle to search for, an empty needle yields nothing
        start_offset: offset in file object to start searching from, if None it will search from current position
        max_offset: how far we search for into the file, 0 for no limit. Bytes at or beyond this absolute
            offset are never read, so a match has to end at or before it.

    Yields:
        offset where `needle` was found in file `fobj`
    """

    needle_len = len(needle)
    if not needle_len:
        return

    overlap_len = needle_len - 1
    if start_offset is not None:
        fobj.seek(start_offset)

    # Track the read position ourselves so a consumer that moves the handle between yields can't derail us
    pos = fobj.tell()
    # Tail of the previous buffer, kept so matches that straddle two reads are still found
    tail = b""

    while True:
        chunk_size = 8192
        if max_offset:
            if pos >= max_offset:
                break
            chunk_size = min(chunk_size, max_offset - pos)

        fobj.seek(pos)
        block = fobj.read(chunk_size)
        if not block:
            # End of file
            break

        data = tail + block
        # File offset of data[0]
        base = pos - len(tail)

        index = data.find(needle)
        while index != -1:
            yield base + index
            index = data.find(needle, index + 1)

        pos += len(block)
        # `data[-0:]` would keep the whole buffer, so single byte needles keep no tail at all
        tail = data[-overlap_len:] if overlap_len else b""
# dissect/executable/pdb/pdb.py
import argparse
from typing import BinaryIO

# External imports
from dissect.cstruct import cstruct

# Local imports
from dissect.executable.pdb.helpers.c_pdb import PDB2_SIGNATURE, PDB7_SIGNATURE, c_pdb
from dissect.executable.pdb.helpers.dbi import DBI
from dissect.executable.pdb.helpers.pagestream import PageStream, pages
from dissect.executable.pdb.helpers.tpi import TPI


class PDBParser:
    """Base class for parsing PDB files.

    Subclasses set `header`, `root_def`, `pagecount_sizetype` and `root_stream` and then call
    `parse_streams()`.

    Args:
        fh: A file like object of a PDB file.
        pdb_cstruct: A `cstruct` object to use while parsing the type definitions of the PDB file.
    """

    def __init__(self, fh: BinaryIO, pdb_cstruct: cstruct = None):
        self.fh = fh
        # Define a new cstruct object for the types, use an existing one if given
        self.pdb_cstruct = pdb_cstruct if pdb_cstruct is not None else cstruct()
        self.streams = []
        self.dbi = None
        self.tpi = None
        self.types = {}
        self.machine = None

    def parse_streams(self):
        """Parse the streams within the PDB file.

        The root stream is parsed so a list of streams can be built which consist of the different stream types that
        are present within PDB files. See the `PageStream` class for a list with entry types.
        """

        self.root = self.root_def(self.root_stream)
        for stream_length in self.root.streamLengths:
            # A size of 0xFFFFFFFF is treated as an empty stream
            if stream_length.stream_size == 0xFFFFFFFF:
                stream_length.stream_size = 0
            pagecount = pages(size=stream_length.stream_size, page_size=self.header.page_size)
            self.streams.append(
                PageStream(
                    fh=self.fh,
                    # The page numbers of each stream follow the stream sizes within the root stream
                    pages=self.pagecount_sizetype[pagecount](self.root_stream),
                    size=stream_length.stream_size,
                    page_size=self.header.page_size,
                )
            )

        self._parse_dbi()
        self._parse_tpi()

    def _parse_dbi(self):
        """Parse the DBI stream within the PDB file.

        Some information that is present within the DBI stream is used throughout the rest of the PDB parsing.
        """

        self.dbi = DBI(streams=self.streams)
        self.machine = self.dbi.header.wMachine
        # Set the pointer size based on the machine architecture
        self.pdb_cstruct.ptr = self.pdb_cstruct.uint64 if self.machine == 0x8664 else self.pdb_cstruct.uint32

        # Parse the information within the DBI stream
        self.dbi.parse_info()

    def _parse_tpi(self):
        """Parse the TPI stream within the PDB file."""

        self.tpi = TPI(streams=self.streams, pdb_cstruct=self.pdb_cstruct)
        self.tpi.parse_types()

    @property
    def info(self) -> cstruct:
        """Return the PDB header that was parsed."""
        return self.header

    @property
    def symbols(self) -> dict:
        """Return the symbols `dict` of the PDB file."""
        return self.dbi.symbols

    @property
    def typedefs(self) -> cstruct:
        """Return the `cstruct` object containing the type definitions of the PDB file."""
        return self.tpi.typedefs()

    def parse_types(self, pdb_cstruct: cstruct = None) -> TPI:
        """Abstraction layer for parsing the types from the TPI stream.

        Args:
            pdb_cstruct: Currently unused, the `cstruct` given to the constructor is the one that is used.

        Returns:
            A `TPI` object containing the types from the parsed PDB type definitions.
        """

        self.tpi.parse_types()
        return self.tpi


class PDB2(PDBParser):
    """Parser for PDBv2 files, using the PDB2 header, the V2 root stream and 16-bit page numbers.

    Args:
        fh: A file like object of a PDB file.
    """

    def __init__(self, fh: BinaryIO):
        super().__init__(fh=fh)
        self.header = c_pdb.PDB2_HEADER(self.fh)
        self.root_def = c_pdb.ROOT_STREAM_V2
        self.pagecount_sizetype = c_pdb.uint16

        # Retrieve the number of root pages
        root_page_count = pages(size=self.header.root_size, page_size=self.header.page_size)

        # Parse the root stream, the root page numbers are read from the current file position
        root_pages = c_pdb.uint16[root_page_count](self.fh)
        self.root_stream = PageStream(
            fh=self.fh, pages=root_pages, size=self.header.root_size, page_size=self.header.page_size
        )
        self.parse_streams()


class PDB7(PDBParser):
    """Parser for PDBv7 files, using the PDB7 header, the V7 root stream and 32-bit page numbers.

    Args:
        fh: A file like object of a PDB file.
    """

    def __init__(self, fh: BinaryIO):
        super().__init__(fh=fh)
        self.header = c_pdb.PDB7_HEADER(self.fh)
        self.root_def = c_pdb.ROOT_STREAM_V7
        self.pagecount_sizetype = c_pdb.uint32

        # Retrieve the number of root pages
        root_page_count = pages(size=self.header.root_size, page_size=self.header.page_size)
        # Root pages in PDBv7 start from page_index * page_size
        self.fh.seek(self.header.root_page_index * self.header.page_size)

        # Parse the root stream
        root_pages = c_pdb.uint32[root_page_count](self.fh)
        self.root_stream = PageStream(
            fh=self.fh, pages=root_pages, size=self.header.root_size, page_size=self.header.page_size
        )
        self.parse_streams()


class PDB:
    """Base class for parsing PDB files.

    Depending on the PDB version the right PDB structures will be used to parse the PDB file. The file is
    kept open for the lifetime of the object; call `close()` or use the object as a context manager to
    release it.

    Args:
        pdb_file: The location of the PDB file to parse.

    Raises:
        NotImplementedError: If the file does not start with a known PDB signature.
    """

    def __init__(self, pdb_file: str):
        self.fh = open(pdb_file, "rb")
        try:
            self._check_pdb_version()
        except Exception:
            # Don't leak the handle when the file turns out to be unsupported or malformed
            self.fh.close()
            raise

    def __enter__(self) -> "PDB":
        return self

    def __exit__(self, *exc_info) -> None:
        self.close()

    def close(self) -> None:
        """Close the underlying PDB file."""
        self.fh.close()

    def _check_pdb_version(self):
        """Pick the right PDB parser depending on the version."""

        signature = self.fh.read(64)

        self.fh.seek(0)
        # Check the PDB signature to see with which version we're dealing
        if signature.startswith(PDB7_SIGNATURE):
            self.pdb = PDB7(fh=self.fh)
        elif signature.startswith(PDB2_SIGNATURE):
            self.pdb = PDB2(fh=self.fh)
        else:
            self.fh.close()
            raise NotImplementedError(f"Unsupported type observed: {signature}")

        self.header = self.pdb.header


def main():
    """Command line entry point: parse a PDB file and optionally print some statistics about it."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-p", "--pdb", required=True, help="PDB file to parse.")
    parser.add_argument(
        "-i", "--info", required=False, action="store_true", help="Parse the PDB information within the DBI stream."
    )

    args = parser.parse_args()
    print(f"Parsing PDB: {args.pdb}")

    with PDB(pdb_file=args.pdb) as pdb_file:
        pdb = pdb_file.pdb

        if args.info:
            dbi = pdb.dbi
            pdb_cstruct = pdb.typedefs

            print(f"Found {len(dbi.module_info_list)} module info fields")
            print(f"Found {len(dbi.section_map_items)} section map items")
            print(f"Found {len(list(dbi.symbols))} symbols")
            print(f"{len(pdb_cstruct.typedefs)} type definitions found in pdb_cstruct")


if __name__ == "__main__":
    main()
+* The following build command can be used in the Visual Studio settings to compile and generate the .pdb file: +* +* For 64-bit: +* /OUT:"\testpdb_x64.exe" /MANIFEST /NXCOMPAT /PDB:"\testpdb_x64.pdb" /DYNAMICBASE "kernel32.lib" "user32.lib" "gdi32.lib" "winspool.lib" "comdlg32.lib" "advapi32.lib" "shell32.lib" "ole32.lib" "oleaut32.lib" "uuid.lib" "odbc32.lib" "odbccp32.lib" /DEBUG /MACHINE:X64 /INCREMENTAL /SUBSYSTEM:CONSOLE /ERRORREPORT:PROMPT /NOLOGO /TLBID:1 +* +* For 32-bit: +* /OUT:"\testpdb_x86.exe" /MANIFEST /NXCOMPAT /PDB:"\testpdb_x86.pdb" /DYNAMICBASE "kernel32.lib" "user32.lib" "gdi32.lib" "winspool.lib" "comdlg32.lib" "advapi32.lib" "shell32.lib" "ole32.lib" "oleaut32.lib" "uuid.lib" "odbc32.lib" "odbccp32.lib" /DEBUG /MACHINE:X86 /INCREMENTAL /SUBSYSTEM:CONSOLE /ERRORREPORT:PROMPT /NOLOGO /TLBID:1 +*/ + +#include +#include + + +typedef unsigned __int64 QWORD, * PQWORD; + + +typedef struct _UNICODE_STRING { + USHORT Length; + USHORT MaximumLength; + PWSTR Buffer; +} UNICODE_STRING; +typedef UNICODE_STRING* PUNICODE_STRING; +typedef const UNICODE_STRING* PCUNICODE_STRING; + + +// -------------------------------------------------------------------------- +// Simple structs +struct simple_datatypes_struct { + char datatype_char; + byte datatype_byte; + short int datatype_short; + int datatype_int; + unsigned int datatype_unsigned_int; + float datatype_float; + double datatype_double; + long int datatype_long; + long long int datatype_longlong; + unsigned long int datatype_unsigned_long; + unsigned long long int datatype_unsigned_longlong; + signed char datatype_signed_char; + unsigned char datatype_unsigned_char; + long double datatype_long_double; + wchar_t datatype_wchar_t; + char16_t datatype_char16_t; + char32_t datatype_char32_t; +} simple_datatypes_struct; + +struct windows_datatypes_struct { + ATOM datatype_ATOM; + BOOL datatype_BOOL; + BOOLEAN datatype_BOOLEAN; + BYTE datatype_BYTE; + CCHAR datatype_CCHAR; + CHAR datatype_CHAR; + COLORREF 
datatype_COLORREF; + DWORD datatype_DWORD; + DWORDLONG datatype_DWORDLONG; + DWORD_PTR datatype_DWORD_PTR; + DWORD32 datatype_DWORD32; + DWORD64 datatype_DWORD64; + FLOAT datatype_FLOAT; + HACCEL datatype_HACCEL; + HALF_PTR datatype_HALF_PTR; + HANDLE datatype_HANDLE; + HBITMAP datatype_HBITMAP; + HBRUSH datatype_HBRUSH; + HCOLORSPACE datatype_HCOLORSPACE; + HCONV datatype_HCONV; + HCONVLIST datatype_HCONVLIST; + HCURSOR datatype_HCURSOR; + HDC datatype_HDC; + HDDEDATA datatype_HDDEDATA; + HDESK datatype_HDESK; + HDROP datatype_HDROP; + HDWP datatype_HDWP; + HENHMETAFILE datatype_HENHMETAFILE; + HFILE datatype_HFILE; + HFONT datatype_HFONT; + HGDIOBJ datatype_HGDIOBJ; + HGLOBAL datatype_HGLOBAL; + HHOOK datatype_HHOOK; + HICON datatype_HICON; + HINSTANCE datatype_HINSTANCE; + HKEY datatype_HKEY; + HKL datatype_HKL; + HLOCAL datatype_HLOCAL; + HMENU datatype_HMENU; + HMETAFILE datatype_HMETAFILE; + HMODULE datatype_HMODULE; + HMONITOR datatype_HMONITOR; + HPALETTE datatype_HPALETTE; + HPEN datatype_HPEN; + HRESULT datatype_HRESULT; + HRGN datatype_HRGN; + HRSRC datatype_HRSRC; + HSZ datatype_HSZ; + HWINSTA datatype_HWINSTA; + HWND datatype_HWND; + INT datatype_INT; + INT_PTR datatype_INT_PTR; + INT8 datatype_INT8; + INT16 datatype_INT16; + INT32 datatype_INT32; + INT64 datatype_INT64; + LANGID datatype_LANGID; + LCID datatype_LCID; + LCTYPE datatype_LCTYPE; + LGRPID datatype_LGRPID; + LONG datatype_LONG; + LONGLONG datatype_LONGLONG; + LONG_PTR datatype_LONG_PTR; + LONG32 datatype_LONG32; + LONG64 datatype_LONG64; + LPARAM datatype_LPARAM; + LPBOOL datatype_LPBOOL; + LPBYTE datatype_LPBYTE; + LPCOLORREF datatype_LPCOLORREF; + LPCSTR datatype_LPCSTR; + LPCTSTR datatype_LPCTSTR; + LPCVOID datatype_LPCVOID; + LPCWSTR datatype_LPCWSTR; + LPDWORD datatype_LPDWORD; + LPHANDLE datatype_LPHANDLE; + LPINT datatype_LPINT; + LPLONG datatype_LPLONG; + LPSTR datatype_LPSTR; + LPTSTR datatype_LPTSTR; + LPVOID datatype_LPVOID; + LPWORD datatype_LPWORD; + LPWSTR datatype_LPWSTR; + 
LRESULT datatype_LRESULT; + PBOOL datatype_PBOOL; + PBOOLEAN datatype_PBOOLEAN; + PBYTE datatype_PBYTE; + PCHAR datatype_PCHAR; + PCSTR datatype_PCSTR; + PCTSTR datatype_PCTSTR; + PCWSTR datatype_PCWSTR; + PDWORD datatype_PDWORD; + PDWORDLONG datatype_PDWORDLONG; + PDWORD_PTR datatype_PDWORD_PTR; + PDWORD32 datatype_PDWORD32; + PDWORD64 datatype_PDWORD64; + PFLOAT datatype_PFLOAT; + PHALF_PTR datatype_PHALF_PTR; + PHANDLE datatype_PHANDLE; + PHKEY datatype_PHKEY; + PINT datatype_PINT; + PINT_PTR datatype_PINT_PTR; + PINT8 datatype_PINT8; + PINT16 datatype_PINT16; + PINT32 datatype_PINT32; + PINT64 datatype_PINT64; + PLCID datatype_PLCID; + PLONG datatype_PLONG; + PLONGLONG datatype_PLONGLONG; + PLONG_PTR datatype_PLONG_PTR; + PLONG32 datatype_PLONG32; + PLONG64 datatype_PLONG64; + PSHORT datatype_PSHORT; + PSIZE_T datatype_PSIZE_T; + PSSIZE_T datatype_PSSIZE_T; + PSTR datatype_PSTR; + PTBYTE datatype_PTBYTE; + PTCHAR datatype_PTCHAR; + PTSTR datatype_PTSTR; + PUCHAR datatype_PUCHAR; + PUHALF_PTR datatype_PUHALF_PTR; + PUINT datatype_PUINT; + PUINT_PTR datatype_PUINT_PTR; + PUINT8 datatype_PUINT8; + PUINT16 datatype_PUINT16; + PUINT32 datatype_PUINT32; + PUINT64 datatype_PUINT64; + PULONG datatype_PULONG; + PULONGLONG datatype_PULONGLONG; + PULONG_PTR datatype_PULONG_PTR; + PULONG32 datatype_PULONG32; + PULONG64 datatype_PULONG64; + PUSHORT datatype_PUSHORT; + PVOID datatype_PVOID; + PWCHAR datatype_PWCHAR; + PWORD datatype_PWORD; + PWSTR datatype_PWSTR; + QWORD datatype_QWORD; + SC_HANDLE datatype_SC_HANDLE; + SC_LOCK datatype_SC_LOCK; + SERVICE_STATUS_HANDLE datatype_SERVICE_STATUS_HANDLE; + SHORT datatype_SHORT; + SIZE_T datatype_SIZE_T; + SSIZE_T datatype_SSIZE_T; + TBYTE datatype_TBYTE; + TCHAR datatype_TCHAR; + UCHAR datatype_UCHAR; + UHALF_PTR datatype_UHALF_PTR; + UINT datatype_UINT; + UINT_PTR datatype_UINT_PTR; + UINT8 datatype_UINT8; + UINT16 datatype_UINT16; + UINT32 datatype_UINT32; + UINT64 datatype_UINT64; + ULONG datatype_ULONG; + ULONGLONG 
datatype_ULONGLONG; + ULONG_PTR datatype_ULONG_PTR; + ULONG32 datatype_ULONG32; + ULONG64 datatype_ULONG64; + UNICODE_STRING datatype_UNICODE_STRING; + USHORT datatype_USHORT; + USN datatype_USN; + VOID *datatype_VOID; + WCHAR datatype_WCHAR; + WORD datatype_WORD; + WPARAM datatype_WPARAM; + std::nullptr_t datatype_nullptr_t; + __wchar_t datatype___wchar_t; + __int8 datatype___int8; + __int16 datatype___int16; + __int32 datatype___int32; + __int64 datatype___int64; +} windows_datatypes_struct; +// -------------------------------------------------------------------------- + +// -------------------------------------------------------------------------- +// Simple enum definitions +typedef enum _enum_uint16_t : uint16_t { + a = 0x0, + b = 0xFF, +} enum_uint16_t; + +typedef enum _enum_int : int { + c = 0x0, + d = 0xFFFF, +} enum_int; + +typedef enum _enum_int64 : int64_t { + e = 0x0, + f = 0xFFFFFFFF, +} enum_int64; +// -------------------------------------------------------------------------- + +// -------------------------------------------------------------------------- +// Combined types +// Structure definitions containing an enum + +struct _enum_structure { + char datatype_char; + enum_uint16_t datatype_enum_uint16_t; + enum_int datatype_enum_int; + enum_int64 datatype_enum_int64_t; + HRESULT datatype_HRESULT; +} enum_structure; + +// Structure containing a struct +struct _struct_structure { + char datatype_char; + _enum_structure datatype_enumstruct; + UNICODE_STRING datatype_unicodestring; + __int64 datatype_int64; +} struct_structure; + +// Structure containing an enum and a struct +struct _enum_and_struct_structure { + char datatype_char; + _enum_structure datatype_enum_struct; + _struct_structure datatype_struct_struct; + LONGLONG datatype_LONGLONG; +} enum_and_struct_struct; +// -------------------------------------------------------------------------- + + +int main() +{ + std::cout << "kusjesvanSRT<3\n"; +} diff --git a/tests/data/hello_world.out 
# tests/test_pdbv7.py
#
# The same patch also carries these changes, which are not Python source of this module:
#   - tests/data/hello_world.out and tests/data/hello_world.stripped.out: mode 100755 -> 100644
#   - tests/data/testpdb_x64.pdb and tests/data/testpdb_x86.pdb: new binary fixtures
#   - tests/test_dump.py: `from util import data_file` replaced by `from .util import data_file`,
#     placed after the `dissect.executable` import

# External dependencies
from dissect.cstruct import cstruct

from dissect.executable.pdb import PDB

# Reference definitions, mirroring the combined types of tests/_src/testpdb.cpp
c_def = """
typedef long HRESULT;
typedef WCHAR PWSTR;

typedef struct _UNICODE_STRING {
    USHORT Length;
    USHORT MaximumLength;
    PWSTR Buffer;
} UNICODE_STRING;

enum enum_uint16_t : uint16_t {
    a = 0x0,
    b = 0xFF,
};

enum enum_int : int {
    c = 0x0,
    d = 0xFFFF,
};

enum enum_int64 : int64_t {
    e = 0x0,
    f = 0xFFFFFFFF,
};

struct _enum_structure {
    char datatype_char;
    enum_uint16_t datatype_enum_uint16_t;
    enum_int datatype_enum_int;
    enum_int64 datatype_enum_int64_t;
    HRESULT datatype_HRESULT;
} enum_structure;

// Structure containing a struct
struct _struct_structure {
    char datatype_char;
    _enum_structure datatype_enumstruct;
    UNICODE_STRING datatype_unicodestring;
    __int64 datatype_int64;
} struct_structure;

// Structure containing an enum and a struct
struct _enum_and_struct_structure {
    char datatype_char;
    _enum_structure datatype_enum_struct;
    _struct_structure datatype_struct_struct;
    LONGLONG datatype_LONGLONG;
} enum_and_struct_struct;
"""

# Don't rebind the name to the return value of `load()`, only the loaded object itself is needed
test_typedefs = cstruct()
test_typedefs.load(c_def, align=True)


def test_valid_pdb7header():
    pdb_file = PDB(pdb_file="tests/data/testpdb_x64.pdb")
    assert pdb_file.header.signature == b"Microsoft C/C++ MSF 7.00\r\n\x1ADS\x00\x00\x00"


def test_invalid_pdb7header():
    pdb_file = PDB(pdb_file="tests/data/testpdb_x64.pdb")
    assert pdb_file.header.signature != b"Microsoft C/C++ MSF 7.00\r\nblah\x00\x00\x00"


def test_pdb7_pagestreams_count():
    pdb_file = PDB(pdb_file="tests/data/testpdb_x64.pdb")
    assert pdb_file.pdb.root.dStreams == 110
    assert len(pdb_file.pdb.streams) == pdb_file.pdb.root.dStreams


def test_pdb7_dbi_machinetype():
    pdb_file = PDB(pdb_file="tests/data/testpdb_x64.pdb")
    assert pdb_file.pdb.dbi.header.wMachine == 0x8664


def test_pdb7_dbi_symbol_records_index():
    pdb_file = PDB(pdb_file="tests/data/testpdb_x64.pdb")
    assert pdb_file.pdb.dbi.header.snSymRecs == 0x67


def test_pdb7_dbi_symbol_info():
    pdb_file = PDB(pdb_file="tests/data/testpdb_x64.pdb")
    symbols = pdb_file.pdb.symbols

    assert len(symbols) == 1890
    assert symbols["std::memory_order_relaxed"].type_index == 0x159F


def test_pdb7_dbi_module_info():
    pdb_file = PDB(pdb_file="tests/data/testpdb_x64.pdb")

    assert len(pdb_file.pdb.dbi.module_info_list) == 52

    first_object_name = pdb_file.pdb.dbi.module_info_list[0].object_name
    assert first_object_name == b"C:\\Users\\user\\source\\repos\\dissect.pdb\\dissect.pdb\\x64\\Debug\\dissect.pdb.obj"

    # Note that this is the second to last entry of the list
    second_to_last_object_name = pdb_file.pdb.dbi.module_info_list[-2].object_name
    assert (
        second_to_last_object_name
        == b"C:\\Program Files (x86)\\Windows Kits\\10\\lib\\10.0.22621.0\\ucrt\\x64\\ucrtd.lib"
    )


def test_pdb7_tpistream_pagesize():
    pdb_file = PDB(pdb_file="tests/data/testpdb_x64.pdb")
    tpi_stream = pdb_file.pdb.streams[2]
    # This used to be a bare comparison, which can never fail
    assert tpi_stream.page_size == 0x1000


def test_pdb7_pdb_cstruct_typedefs():
    pdb_file = PDB(pdb_file="tests/data/testpdb_x64.pdb")
    pdb_cstruct = pdb_file.pdb.typedefs

    assert len(pdb_cstruct.typedefs) == 545
    assert "simple_datatypes_struct" in pdb_cstruct.typedefs
    assert "windows_datatypes_struct" in pdb_cstruct.typedefs


def test_pdb7_pdb_cstruct_names():
    pdb_file = PDB(pdb_file="tests/data/testpdb_x64.pdb")
    pdb_cstruct = pdb_file.pdb.typedefs

    test_struct_struct_names = [i.name for i in test_typedefs.typedefs["_struct_structure"].fields]
    pdb_struct_struct_names = [i.name for i in pdb_cstruct.typedefs["_struct_structure"].fields]
    assert test_struct_struct_names == pdb_struct_struct_names


def test_pdb7_pdb_cstruct_parsing():
    pdb_file = PDB(pdb_file="tests/data/testpdb_x64.pdb")
    pdb_cstruct = pdb_file.pdb.typedefs

    test_enum_structure = pdb_cstruct._enum_structure(
        b"\x02\x00\x00\x00\xFF\xFF\xFF\xFF\x00\x00\x00\x00\x00\x00\x00\x00\x01\x02\x03\x04"
    )
    assert test_enum_structure.datatype_char == b"\x02"
    assert test_enum_structure.datatype_HRESULT == 0x4030201