Showing posts with label Windows. Show all posts
Showing posts with label Windows. Show all posts

Saturday, July 29, 2017

Windows developers' misconception about UNIX.

While reading osronline.com forum on Windows file system development I ran into a common misconception among Windows developers regarding UNIX design. http://osronline.com/cf.cfm?PageURL=showThread.CFM?link=285260
<QUOTE>
The essential difference between how the NT kernel works and how Unix was
designed is that NT caches streams of data (above the file system), whereas
on Unix data is cached at the block layer.
</QUOTE>
I spent 5 minutes to bust it.
This is true only for ancient *NIX kernels. Modern kernels use the same technique as NT with caching backed by file mapping structures.
For example below is a call stack from my test machine running the Linux kernel (4.12.2) when ext4 read operation (ext4_file_read_iter) called the "Linux cache manager" ( do_generic_file_read -> page_cache_sync_readahead ) to bring data in the cache backed by mapped file structures( struct address_space ) when processing the read() system call.
This resulted in a recursive call to mapping->a_ops->readpages into a file system's ext4_readpages . This is an analogue of a cached read in NT. Mac OS X uses the same caching by file mapping technique borrowed from BSD.
(gdb) bt
#0  ext4_readpages (file=0xffff88001d59b300, mapping=0xffff88001d1d56c0, pages=0xffffc90000817c30, nr_pages=1) at ../fs/ext4/inode.c:3308
#1  0xffffffff811b6288 in read_pages (gfp=<optimised out>, nr_pages=<optimised out>, pages=<optimised out>, filp=<optimised out>, mapping=<optimised out>) at ../mm/readahead.c:121
#2  __do_page_cache_readahead (mapping=<optimised out>, filp=<optimised out>, offset=1, nr_to_read=<optimised out>, lookahead_size=<optimised out>) at ../mm/readahead.c:199
#3  0xffffffff811b64b8 in ra_submit (ra=<optimised out>, ra=<optimised out>, ra=<optimised out>, filp=<optimised out>, mapping=<optimised out>) at ../mm/internal.h:66
#4  ondemand_readahead (mapping=0xffff88001d1d56c0, ra=0xffff88001d59b398, filp=0xffff88001d59b300, hit_readahead_marker=<optimised out>, offset=0, req_size=<optimised out>) at ../mm/readahead.c:478
#5  0xffffffff811b678e in page_cache_sync_readahead (mapping=<optimised out>, ra=<optimised out>, filp=<optimised out>, offset=<optimised out>, req_size=<optimised out>) at ../mm/readahead.c:510
#6  0xffffffff811a7a62 in do_generic_file_read (written=<optimised out>, iter=<optimised out>, ppos=<optimised out>, filp=<optimised out>) at ../mm/filemap.c:1813
#7  generic_file_read_iter (iocb=0x20000, iter=<optimised out>) at ../mm/filemap.c:2069
#8  0xffffffff812d1386 in ext4_file_read_iter (iocb=0xffff88001d59b300, to=0xffff88001d1d56c0) at ../fs/ext4/file.c:70
#9  0xffffffff81237680 in call_read_iter (file=<optimised out>, iter=<optimised out>, kio=<optimised out>) at ../include/linux/fs.h:1728
#10 new_sync_read (ppos=<optimised out>, len=<optimised out>, buf=<optimised out>, filp=<optimised out>) at ../fs/read_write.c:440
#11 __vfs_read (file=0xffff88001d59b300, buf=<optimised out>, count=<optimised out>, pos=0xffffc90000817f18) at ../fs/read_write.c:452
#12 0xffffffff81237cc3 in vfs_read (file=0xffff88001d59b300, buf=0x7fb92a0cb000 <error: Cannot access memory at address 0x7fb92a0cb000>, count=<optimised out>, pos=0xffffc90000817f18)
    at ../fs/read_write.c:473
#13 0xffffffff81239385 in SYSC_read (count=<optimised out>, buf=<optimised out>, fd=<optimised out>) at ../fs/read_write.c:589
#14 SyS_read (fd=<optimised out>, buf=140433251151872, count=131072) at ../fs/read_write.c:582
#15 0xffffffff818aaffb in entry_SYSCALL_64 () at ../arch/x86/entry/entry_64.S:203

(gdb) f 4
#4  ondemand_readahead (mapping=0xffff88001d1d56c0, ra=0xffff88001d59b398, filp=0xffff88001d59b300, hit_readahead_marker=<optimised out>, offset=0, req_size=<optimised out>) at ../mm/readahead.c:478
478  return ra_submit(ra, mapping, filp);

(gdb) p/x *mapping
$14 = {host = 0xffff88001d1d5548, page_tree = {gfp_mask = 0x1180020, rnode = 0x0}, tree_lock = {{rlock = {raw_lock = {val = {counter = 0x0}}}}}, i_mmap_writable = {counter = 0x0}, i_mmap = {
    rb_node = 0x0}, i_mmap_rwsem = {count = {counter = 0x0}, wait_list = {next = 0xffff88001d1d56f0, prev = 0xffff88001d1d56f0}, wait_lock = {raw_lock = {val = {counter = 0x0}}}, osq = {tail = {
        counter = 0x0}}, owner = 0x0}, nrpages = 0x0, nrexceptional = 0x0, writeback_index = 0x0, a_ops = 0xffffffff81a3a680, flags = 0x0, private_lock = {{rlock = {raw_lock = {val = {
            counter = 0x0}}}}}, gfp_mask = 0x14200ca, private_list = {next = 0xffff88001d1d5740, prev = 0xffff88001d1d5740}, private_data = 0x0}
            
(gdb) ptype mapping
type = struct address_space {
    struct inode *host;
    struct radix_tree_root page_tree;
    spinlock_t tree_lock;
    atomic_t i_mmap_writable;
    struct rb_root i_mmap;
    struct rw_semaphore i_mmap_rwsem;
    unsigned long nrpages;
    unsigned long nrexceptional;
    unsigned long writeback_index;
    const struct address_space_operations *a_ops;
    unsigned long flags;
    spinlock_t private_lock;
    gfp_t gfp_mask;
    struct list_head private_list;
    void *private_data;
} *

(gdb) f 1
#1  0xffffffff811b6288 in read_pages (gfp=<optimised out>, nr_pages=<optimised out>, pages=<optimised out>, filp=<optimised out>, mapping=<optimised out>) at ../mm/readahead.c:121
121   ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
(gdb) l
116  int ret;
117 
118  blk_start_plug(&plug);
119 
120  if (mapping->a_ops->readpages) {
121   ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
122   /* Clean up the remaining pages */
123   put_pages_list(pages);
124   goto out;
125  }

(gdb) f 9
#9  0xffffffff81237680 in call_read_iter (file=<optimised out>, iter=<optimised out>, kio=<optimised out>) at ../include/linux/fs.h:1728
1728  return file->f_op->read_iter(kio, iter);
(gdb) l
1723 } ____cacheline_aligned;
1724 
1725 static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
1726          struct iov_iter *iter)
1727 {
1728  return file->f_op->read_iter(kio, iter);
1729 }
1730 
1731 static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
1732           struct iov_iter *iter)
(gdb) 

Monday, July 3, 2017

FltCreateFile and top device.

FltCreateFile calls IoCreateFileEx with IO_DRIVER_CREATE_CONTEXT.DeviceObjectHint pointing to the Filter Manager's filter object and then calls the lower registered filters. That allows the created file object to have TopDeviceObjectHint pointing to the Filter Manager's object.
 # Child-SP          RetAddr           Call Site
00 ffffe101`3fffebf8 fffff801`92125200 FLTMGR!FltpCreate
01 ffffe101`3fffec00 fffff801`9213058b nt!IopParseDevice+0x7f0
02 ffffe101`3fffedd0 fffff801`921340c0 nt!ObpLookupObjectName+0x46b
03 ffffe101`3fffefa0 fffff801`9213803a nt!ObOpenObjectByNameEx+0x1e0
04 ffffe101`3ffff0e0 fffff801`920b0eb4 nt!IopCreateFile+0x3aa
05 ffffe101`3ffff180 fffff808`485240d5 nt!IoCreateFileEx+0x124
06 ffffe101`3ffff210 fffff808`4853d32d FLTMGR!FltpCreateFile+0x1cd
07 ffffe101`3ffff310 fffff808`4b6a79f8 FLTMGR!FltCreateFile+0x8d
08 ffffe101`3ffff3a0 fffff808`484f4b4c avscan!AvPreCreate+0x378 [d:\work\avscan\filter\avscan.c @ 2106]
09 ffffe101`3ffff4b0 fffff808`484f46ec FLTMGR!FltpPerformPreCallbacks+0x2ec
0a ffffe101`3ffff5d0 fffff808`48526117 FLTMGR!FltpPassThroughInternal+0x8c
0b ffffe101`3ffff600 fffff801`92125200 FLTMGR!FltpCreate+0x2d7
0c ffffe101`3ffff6b0 fffff801`9213058b nt!IopParseDevice+0x7f0
0d ffffe101`3ffff880 fffff801`921340c0 nt!ObpLookupObjectName+0x46b
0e ffffe101`3ffffa50 fffff801`920c9e90 nt!ObOpenObjectByNameEx+0x1e0

0: kd> dt nt!_FILE_OBJECT ffff948c621a3330
   +0x000 Type             : 0n5
   +0x002 Size             : 0n216
   +0x008 DeviceObject     : 0xffff948c`60a3bc80 _DEVICE_OBJECT
   +0x010 Vpb              : 0xffff948c`60a556e0 _VPB
   +0x018 FsContext        : 0xffff948c`6111a740 Void
   +0x020 FsContext2       : 0xffff8483`76cf78f0 Void
   +0x028 SectionObjectPointer : (null) 
   +0x030 PrivateCacheMap  : (null) 
   +0x038 FinalStatus      : 0n0
   +0x040 RelatedFileObject : (null) 
   +0x048 LockOperation    : 0 ''
   +0x049 DeletePending    : 0 ''
   +0x04a ReadAccess       : 0x1 ''
   +0x04b WriteAccess      : 0 ''
   +0x04c DeleteAccess     : 0 ''
   +0x04d SharedRead       : 0x1 ''
   +0x04e SharedWrite      : 0x1 ''
   +0x04f SharedDelete     : 0x1 ''
   +0x050 Flags            : 0x40000
   +0x058 FileName         : _UNICODE_STRING "\"
   +0x068 CurrentByteOffset : _LARGE_INTEGER 0x0
   +0x070 Waiters          : 0
   +0x074 Busy             : 0
   +0x078 LastLock         : (null) 
   +0x080 Lock             : _KEVENT
   +0x098 Event            : _KEVENT
   +0x0b0 CompletionContext : (null) 
   +0x0b8 IrpListLock      : 0
   +0x0c0 IrpList          : _LIST_ENTRY [ 0xffff948c`621a33f0 - 0xffff948c`621a33f0 ]
   +0x0d0 FileObjectExtension : 0xffff948c`6226f1b0 Void
   
0: kd> dq 0xffff948c`6226f1b0
ffff948c`6226f1b0  00000000`00000000 00000000`00000000
ffff948c`6226f1c0  ffff948c`60dff0d0 00000000`00000000
ffff948c`6226f1d0  ffff948c`6243f2c0 00000000`00000000
ffff948c`6226f1e0  00000000`00000000 00000000`00000000
ffff948c`6226f1f0  00000000`00000000 00000000`00000000
ffff948c`6226f200  61436d4d`02120006 00000000`0000034c
ffff948c`6226f210  ffff8483`7535ed10 ffff948c`62161e28
ffff948c`6226f220  ffff948c`6221da78 00000000`00000000

0: kd> dq ffff948c`60dff0d0
ffff948c`60dff0d0  ffff948c`610734a0 00000000`00000000
ffff948c`60dff0e0  00000000`00000000 00000000`00000000
ffff948c`60dff0f0  65536d4d`02060003 6c8da38a`069a7123
ffff948c`60dff100  00000000`00000000 0000024e`49c8000a
ffff948c`60dff110  0000024e`49c80fff 00000000`00000000
ffff948c`60dff120  00000000`00000000 00000000`00000000
ffff948c`60dff130  00000000`00000000 00000000`00000000
ffff948c`60dff140  00000000`00000002 00000000`00000000

0: kd> !object ffff948c`610734a0
Object: ffff948c610734a0  Type: (ffff948c5e34eb00) Device
    ObjectHeader: ffff948c61073470 (new version)
    HandleCount: 0  PointerCount: 1
    
0: kd> !devstack ffff948c610734a0
  !DevObj           !DrvObj            !DevExt           ObjectName
> ffff948c610734a0  \FileSystem\FltMgr ffff948c610735f0  
  ffff948c61048060  \FileSystem\fastfatffff948c610481b0 
  
0: kd> !vpb 0xffff948c`60a556e0
Vpb at 0xffff948c60a556e0
Flags: 0x1 mounted 
DeviceObject: 0xffff948c61048060
RealDevice:   0xffff948c60a3bc80
RefCount: 8
Volume Label: 

Thursday, June 8, 2017

Windows. Cache prefetching


00 nt!KiSwapContext
01 nt!KiSwapThread
02 nt!KiCommitThreadWait
03 nt!KeWaitForSingleObject
04 nt!MiWaitForInPageComplete
05 nt!MiPfCompleteInPageSupport
06 nt!MiPfCompletePrefetchIos
07 nt!MmWaitForCacheManagerPrefetch
08 nt!CcFetchDataForRead
09 nt!CcMapAndCopyFromCache
0a nt!CcCopyReadEx
0b nt!CcCopyRead

Monday, April 3, 2017

TLB flushing call on Windows

nt!KiRetireDpcList+0xd7
nt!KxRetireDpcList+0x5 (TrapFrame @ fffff800`cc332e70)
nt!KiDispatchInterruptContinue
nt!KiDpcInterrupt+0xca (TrapFrame @ ffffd000`a9b34d90)
nt!MiFlushTbList+0x20c
nt!MiDeleteSystemPagableVm+0x4d9
nt!MiPurgeSpecialPoolPaged+0x18
nt!MmFreeSpecialPool+0x3cf
nt!ExDeferredFreePool+0x677
nt!VerifierExFreePoolWithTag+0x44

Thursday, February 16, 2017

What a BSOD!


APC_INDEX_MISMATCH (1)
This is a kernel internal error. The most common reason to see this
bugcheck is when a filesystem or a driver has a mismatched number of
calls to disable and re-enable APCs. The key data item is the
Thread->CombinedApcDisable field. This consists of two separate 16-bit
fields, the SpecialApcDisable and the KernelApcDisable. A negative value
of either indicates that a driver has disabled special or normal APCs
(respectively) without re-enabling them; a positive value indicates that
a driver has enabled special or normal APCs (respectively) too many times.
Arguments:
Arg1: 00007ffeb44461b4, Address of system call function or worker routine
Arg2: 0000000000000000, Thread->ApcStateIndex
Arg3: 000000000000ffff, (Thread->SpecialApcDisable << 16) | Thread->KernelApcDisable
Arg4: ffffc6816b407b80, Call type (0 - system call, 1 - worker routine)

Sunday, January 29, 2017

Current process when closing a kernel handle.

If you call PsGetCurrentProcess() in a filter or driver while processing IRP_MJ_CLEANUP for a kernel handle, the system process is returned, because NtClose() calls KeStackAttachProcess() to attach to the system process when the handle belongs to the system process's kernel handle table.

2: kd> !thread ffffc5006e6da080
THREAD ffffc5006e6da080  Cid 1588.05ec  Teb: 00000000002aa000 Win32Thread: 0000000000000000 WAIT: (WrResource) KernelMode Non-Alertable
    ffffc5006be8eb70  SynchronizationEvent
IRP List:
    ffffc5006e2ba140: (0006,04c0) Flags: 00000404  Mdl: 00000000
    ffffc50075b8aae0: (0006,0118) Flags: 00060000  Mdl: 00000000
Not impersonating
DeviceMap                 ffffd58256416bd0
Owning Process            ffffc500733ff080       Image:         XXXXXXXX
Attached Process          ffffc5006b8b66c0       Image:        System

Wednesday, January 25, 2017

Microsoft Security Essentials content scan callback to the service.

Below is a call stack captured when the MSE file system filter (WdFilter.sys) called its service (MsMpEng.exe) to perform a file content scan on file open.


00 nt!KiSwapContext
01 nt!KiSwapThread
02 nt!KiCommitThreadWait
03 nt!KeWaitForMultipleObjects
04 nt!FsRtlCancellableWaitForMultipleObjects
05 FLTMGR!FltSendMessage
06 WdFilter!MpScanFile
07 WdFilter!MpAmPostCreate
08 WdFilter!MpPostCreate
09 FLTMGR!FltpPerformPostCallbacks
0a FLTMGR!FltpPassThroughCompletionWorker
0b FLTMGR!FltpLegacyProcessingAfterPreCallbacksCompleted
0c FLTMGR!FltpCreate
16 nt!IopParseDevice
17 nt!ObpLookupObjectName
18 nt!ObOpenObjectByNameEx
19 nt!IopCreateFile
1a nt!NtCreateFile
1b nt!KiSystemServiceCopyEnd
1c ntdll!NtCreateFile



In response the service sent an IOCTL to the filter to create a section( i.e. a mapped file) for data scan

0b mup!MupStateMachine
0c mup!MupFsControl
0d FLTMGR!FltpLegacyProcessingAfterPreCallbacksCompleted
0e FLTMGR!FltPerformSynchronousIo
0f FLTMGR!IssueControlOperation
10 FLTMGR!FltFsControlFile
11 FLTMGR!FltpSetPurgeFailureMode
12 FLTMGR!FltCreateSectionForDataScan
13 WdFilter!MpCreateSection
14 WdFilter!MpMessage
15 FLTMGR!FltpFilterMessage
16 FLTMGR!FltpMsgDispatch
17 FLTMGR!FltpDispatch
21 nt!IopSynchronousServiceTail
22 nt!IopXxxControlFile
23 nt!NtDeviceIoControlFile
24 nt!KiSystemServiceCopyEnd
25 ntdll!NtDeviceIoControlFile


Sunday, January 8, 2017

Setting Irp->UserIosb for unsuccessful requests.

Just for the record.

Irp->IoStatus is not copied to Irp->UserIosb by the special kernel mode APC, i.e. IopCompleteRequest, on Irp completion if NT_ERROR(Irp->IoStatus.Status) is true and the Irp is synchronous or has not been made pending. This is important when returning any information in Irp->IoStatus.Information for unsuccessful requests when Irp->Flags doesn't have the IRP_BUFFERED_IO flag set. To indicate that the data has not been returned and to provide additional information in Irp->UserIosb.Information, use a special status like STATUS_BUFFER_OVERFLOW, which is not an error code. If the IRP_BUFFERED_IO flag is set you can't use the Information field this way for an unsuccessful request, because whenever NT_ERROR(Irp->IoStatus.Status) is false the system will try to copy data from Irp->AssociatedIrp.SystemBuffer to Irp->UserBuffer.

Wednesday, December 28, 2016

ExInterlockedPopEntrySList processing by scheduler.

I believe this topic on ExInterlockedPopEntrySList might be interesting for Windows driver developers.

Safety of using ExInterlockedPopEntrySList

The question was

To my knowledge, pre-Windows 8 x64 implementations of SList use 9-bit sequence numbers in the SLIST_HEADER. This means that 512 operations can complete concurrently (without progress from particular thread) until an ABA problem potentially manifests. I wonder whether, depending on the number of threads and physical cores, this couldn't plausibly occur. To further complicate, the kernel could run on a vcpu, creating time discontinuities. I would like to ask: 1. Does the Windows scheduler protect against ABA by, e.g., restarting interlocked operation upon preemption? 2. Is there some protection against hypervisor interference? 3. In the light of the above concerns, is SList on a pre-Windows 8 x64 deployment really safe for all workloads? I would have speculated that per-thread kernel allocator behavior was factored in for the ABA avoidance, but the primitives are in the Win32 API as well and any driver can employ custom pool allocator.
My answer was

I looked at the code again and found that the interrupt processing code has a fixup for SList. There is a routine KiCheckForSListAddress. This routine is called at DISPATCH_LEVEL before returning from an interrupt, and it fixes the EIP (RIP for x64) of a trap frame to restart the SList pop operation if the interrupt happened inside ExInterlockedPopEntrySList. So when the interrupt processing code returns execution to the interrupted code, the code resumes at the beginning of ExInterlockedPopEntrySList ( namely ExpInterlockedPopEntrySListResume ).

kd> uf KiCheckForSListAddress
nt!KiCheckForSListAddress:
82acbdf1 0fb7416c        movzx   eax,word ptr [ecx+6Ch]
82acbdf5 8b5168          mov     edx,dword ptr [ecx+68h]
82acbdf8 6683f808        cmp     ax,8
82acbdfc 7511            jne     nt!KiCheckForSListAddress+0x1e (82acbe0f)  Branch

nt!KiCheckForSListAddress+0xd:
82acbdfe b8f4dda882      mov     eax,offset nt!ExpInterlockedPopEntrySListResume (82a8ddf4)
82acbe03 3bd0            cmp     edx,eax
82acbe05 7222            jb      nt!KiCheckForSListAddress+0x38 (82acbe29)  Branch

nt!KiCheckForSListAddress+0x16:
82acbe07 81fa1fdea882    cmp     edx,offset nt!ExpInterlockedPopEntrySListEnd (82a8de1f)
82acbe0d eb15            jmp     nt!KiCheckForSListAddress+0x33 (82acbe24)  Branch

nt!KiCheckForSListAddress+0x1e:
82acbe0f 6683f81b        cmp     ax,1Bh
82acbe13 7514            jne     nt!KiCheckForSListAddress+0x38 (82acbe29)  Branch

nt!KiCheckForSListAddress+0x24:
82acbe15 a1ac69bb82      mov     eax,dword ptr [nt!KeUserPopEntrySListResume (82bb69ac)]
82acbe1a 3bd0            cmp     edx,eax
82acbe1c 720b            jb      nt!KiCheckForSListAddress+0x38 (82acbe29)  Branch

nt!KiCheckForSListAddress+0x2d:
82acbe1e 3b15a469bb82    cmp     edx,dword ptr [nt!KeUserPopEntrySListEnd (82bb69a4)]

nt!KiCheckForSListAddress+0x33:
82acbe24 7703            ja      nt!KiCheckForSListAddress+0x38 (82acbe29)  Branch

nt!KiCheckForSListAddress+0x35:
82acbe26 894168          mov     dword ptr [ecx+68h],eax

nt!KiCheckForSListAddress+0x38:
82acbe29 c3              ret  Branch

Wednesday, October 12, 2016

Windows and Linux kernels exception handling and stack unwinding

An interesting difference between the Windows and Linux kernels is the Windows mechanism for unwinding a call stack, aka Frame Unwind. Both the 64-bit Windows kernel and the Linux kernel use table-based exception processing to locate a handler for an instruction that caused an exception. The Windows kernel can unwind the call stack to locate a caller's handler, while Linux requires a table entry for each executable address range that can cause an exception.

You can look at pseudo-code for Windows 64 bit RtlUnwind here StackWalk64.cpp .

Some resources on Windows 64 bit SEH implementation.

1. Exceptional behavior: the Windows 8.1 X64 SEH Implementation  http://blog.talosintel.com/2014/06/exceptional-behavior-windows-81-x64-seh.html

2. Exceptional Behavior - x64 Structured Exception Handling - OSR Online. http://www.osronline.com/article.cfm?article=469

3. Johnson, Ken. " Programming against the x64 exception handling support ."  http://www.nynaeve.net/?p=113

The code was borrowed from http://www.nynaeve.net/Code/StackWalk64.cpp

//
// StackTrace64
//
// Prints a trace of the calling thread's stack to the debugger and then
// breaks into the debugger.
//
// The routine captures its own register context with RtlCaptureContext and
// then repeatedly unwinds one frame at a time.  For each frame it looks up
// the RUNTIME_FUNCTION covering Context.Rip; when one exists the unwind is
// delegated to RtlVirtualUnwind, otherwise the function is treated as a leaf
// and the return address is popped from the top of the stack by hand.  After
// every unwind step the resulting register context is printed together with
// the stack locations (reported through KNONVOLATILE_CONTEXT_POINTERS) at
// which nonvolatile integer registers were saved.
//
// The walk stops when the unwound Rip becomes zero.
//
// Takes no arguments and returns nothing; all output goes through DbgPrint.
//
__declspec(noinline)
VOID
StackTrace64(
 VOID
 )
{
 CONTEXT                       Context;
 KNONVOLATILE_CONTEXT_POINTERS NvContext;
 UNWIND_HISTORY_TABLE          UnwindHistoryTable;
 PRUNTIME_FUNCTION             RuntimeFunction;
 PVOID                         HandlerData;
 ULONG64                       EstablisherFrame;
 ULONG64                       ImageBase;

 DbgPrint("StackTrace64: Executing stack trace...\n");

 //
 // First, we'll get the caller's context.
 //

 RtlCaptureContext(&Context);

 //
 // Initialize the (optional) unwind history table.
 //

 RtlZeroMemory(
  &UnwindHistoryTable,
  sizeof(UNWIND_HISTORY_TABLE));

 UnwindHistoryTable.Unwind = TRUE;

 //
 // This unwind loop intentionally skips the first call frame, as it shall
 // correspond to the call to StackTrace64, which we aren't interested in.
 // (The context is unwound once before anything is printed.)
 //

 for (ULONG Frame = 0;
   ;
   Frame++)
 {
  //
  // Try to look up unwind metadata for the current function.
  //

  RuntimeFunction = RtlLookupFunctionEntry(
   Context.Rip,
   &ImageBase,
   &UnwindHistoryTable
   );

  //
  // Reset the saved-register pointers so that only the stores performed
  // by this frame's unwind are reported below.
  //

  RtlZeroMemory(
   &NvContext,
   sizeof(KNONVOLATILE_CONTEXT_POINTERS));

  if (!RuntimeFunction)
  {
   //
   // If we don't have a RUNTIME_FUNCTION, then we've encountered
   // a leaf function.  Adjust the stack appropriately: the return
   // address sits at the top of the stack, so pop it into Rip.
   //

   Context.Rip  = (ULONG64)(*(PULONG64)Context.Rsp);
   Context.Rsp += 8;
  }
  else
  {
   //
   // Otherwise, call upon RtlVirtualUnwind to execute the unwind for
   // us.  HandlerData and EstablisherFrame are required outputs that
   // this routine does not use.
   //

   RtlVirtualUnwind(
    UNW_FLAG_NHANDLER,
    ImageBase,
    Context.Rip,
    RuntimeFunction,
    &Context,
    &HandlerData,
    &EstablisherFrame,
    &NvContext);
  }

  //
  // If we reach an RIP of zero, this means that we've walked off the end
  // of the call stack and are done.
  //

  if (!Context.Rip)
   break;

  //
  // Display the context.  Note that we don't bother showing the XMM
  // context, although we have the nonvolatile portion of it.
  //

  DbgPrint(
   "FRAME %02x: Rip=%p Rsp=%p Rbp=%p\n",
   Frame,
   Context.Rip,
   Context.Rsp,
   Context.Rbp);   // was Context.Rsp: the Rbp column repeated Rsp
  DbgPrint(
   "r12=%p r13=%p r14=%p\n"
   "rdi=%p rsi=%p rbx=%p\n"
   "rbp=%p rsp=%p\n",
   Context.R12,
   Context.R13,
   Context.R14,
   Context.Rdi,
   Context.Rsi,
   Context.Rbx,
   Context.Rbp,
   Context.Rsp
   );

  //
  // Names for the 16 slots of NvContext.IntegerContext, in index order.
  //

  static const CHAR* RegNames[ 16 ] =
  { "Rax", "Rcx", "Rdx", "Rbx", "Rsp", "Rbp", "Rsi", "Rdi", "R8", "R9",
    "R10", "R11", "R12", "R13", "R14", "R15" };

  //
  // If we have stack-based register stores, then display them here.
  //

  for (ULONG i = 0;
    i < 16;
    i++)
  {
   if (NvContext.IntegerContext[ i ])
   {
    DbgPrint(
     " -> Saved register '%s' on stack at %p (=> %p)\n",
     RegNames[ i ],
     NvContext.IntegerContext[ i ],
     *NvContext.IntegerContext[ i ]);
   }
  }

  DbgPrint("\n");
 }

 DbgBreakPoint();

 return;
}

Tuesday, September 6, 2016

Waiting for concurrent page fault completion

An interesting call stack when a thread waits in a page fault for another thread completing paging data from a file

00 nt!KiSwapContext
01 nt!KiSwapThread
02 nt!KiCommitThreadWait
03 nt!KeWaitForSingleObject
04 nt!MiWaitForCollidedFaultComplete
05 nt!MiResolveTransitionFault
06 nt!MiResolveProtoPteFault
07 nt!MiDispatchFault
08 nt!MmAccessFault
09 nt!KiPageFault
0a nt!memcpy
0b nt!CcCopyBytesToUserBuffer
0c nt!CcMapAndCopyFromCache
0d nt!CcCopyReadEx
0e nt!CcCopyRead
0f nt!FsRtlCopyRead
10 ***
11 ***
12 ***
13 nt!NtReadFile
14 nt!KiSystemServiceCopyEnd

Friday, September 2, 2016

FileObjects and SectionObjectPointer in Windows.

Just for the record.

FileObject->SectionObjectPointer is allocated and set by a file system driver but the structure is managed by the Memory Manager (Mm). SectionObjectPointer is shared between all file objects for the same data stream.

FileObject->SectionObjectPointer->DataSectionObject and FileObject->SectionObjectPointer->ImageSectionObject contain address of ControlArea for data and image.

ControlArea deletion is synchronized by ControlArea->WaitingForDeletion and ControlArea->u.Flags.BeingDeleted. WaitingForDeletion points to a structure with notification event and a reference counter.

All functions that might destroy a control area take SectionObjectPointer as a parameter. These functions acquire a global lock and then check that ControlArea is not NULL. If the control area exists, ControlArea->u.Flags.BeingDeleted is checked, and if it is set the function waits on the WaitingForDeletion event with an incremented reference counter, so that the event is deleted when the last waiting thread exits from the waiting state and the reference counter drops to zero. A call to MiCleanSection sets SectionObjectPointer->DataSectionObject and SectionObjectPointer->ImageSectionObject to NULL. This call is synchronized with ControlArea->u.Flags.BeingDeleted.

The functions that might delete a control area include MmFlushImageSection and CcPurgeCacheSection. That means that it is safe to provide SectionObjectPointer to these functions without synchronizing with file object deletion. It is even possible to call these functions with a SectionObjectPointer when all related file objects have been deleted or have IopDeleteFile being called for them, which might happen in the IRP_MJ_PNP processing path.

Friday, June 17, 2016

Caching and file object reference in Windows.

This is how the last reference to a file object backing cached file data is being released by the kernel. In that case this was a network filesystem

19 nt!IofCallDriver
1a mup!MupiCallUncProvider
1b mup!MupStateMachine
1c mup!MupClose
1d nt!IofCallDriver
1e nt!IopDeleteFile
1f nt!ObpRemoveObjectRoutine
20 nt!ObfDereferenceObjectWithTag
21 nt!ObfDereferenceObject
22 nt!CcDeleteSharedCacheMap
23 nt!CcWriteBehind
24 nt!CcWorkerThread
25 nt!ExpWorkerThread
26 nt!PspSystemThreadStartup
27 nt!KiThreadStartup

Friday, June 10, 2016

How handles are closed on process termination in Windows

Just for curiosity. A call stack when handles are closed on process termination

00 nt!ObpDecrementHandleCount
01 nt!ObpCloseHandleTableEntry
02 nt!ExSweepHandleTable
03 nt!ObKillProcess
04 nt!PspExitThread
05 nt!PsExitSpecialApc
06 nt!KiDeliverApc
07 nt!KiServiceExit
08 ntdll!KiFastSystemCallRet
09 ntdll!ZwWaitForWorkViaWorkerFactory
0a ntdll!TppWorkerThread
0b KERNEL32!BaseThreadInitThunk
0c ntdll!__RtlUserThreadStart
0d ntdll!_RtlUserThreadStart

Monday, May 23, 2016

ZwQuerySystemInformation fails for SystemSessionProcessesInformation(53) when called from a driver

The following kernel mode code will always fail with the STATUS_ACCESS_VIOLATION ( C0000005 ) error if used with the well-known definition of SYSTEM_SESSION_PROCESS_INFORMATION.

// The widely circulated definition of the input structure passed to
// ZwQuerySystemInformation for SystemSessionProcessesInformation.
// With two ULONGs followed by a pointer it is 0x10 bytes on a 64-bit system.
typedef struct _SYSTEM_SESSION_PROCESS_INFORMATION {
    ULONG SessionId;   // presumably the session whose processes are queried
    ULONG SizeOfBuf;   // presumably the size of Buffer in bytes
    PVOID Buffer;      // caller-supplied buffer
} SYSTEM_SESSION_PROCESS_INFORMATION, *PSYSTEM_SESSION_PROCESS_INFORMATION;

// Fragment of a kernel mode caller; SessionId, Buffer, SizeOfBuf, RC and
// ReturnedLength are declared by the surrounding code.
SYSTEM_SESSION_PROCESS_INFORMATION     Info;

Info.SessionId = SessionId;
Info.Buffer = Buffer; // a buffer allocated in the system space
Info.SizeOfBuf = SizeOfBuf;

// The third argument is the size of the structure itself, not of Info.Buffer.
RC = ZwQuerySystemInformation( SystemSessionProcessesInformation, &Info, sizeof(Info), &ReturnedLength );


I disassembled the sequence of calls until an error was returned. The reason for the failure is that the definition of SYSTEM_SESSION_PROCESS_INFORMATION has probably changed starting from Vista. The kernel checks the size of the structure. The size is the third parameter of ZwQuerySystemInformation. If the size is 0x10 (on a 64 bit system) ExpQuerySystemInformation calls ProbeForWrite for Info.Buffer regardless of the previous mode ( in this case the previous mode was KernelMode ). Evidently the system allows the old definition to be used only by user mode code, as ProbeForWrite always throws an exception ( SEH ) when called with a kernel mode address as a parameter.

Below is a call stack when ProbeForWrite is called

nt!ProbeForWrite
nt!ExpQuerySystemInformation
nt!NtQuerySystemInformation
nt!KiSystemServiceCopyEnd
nt!KiServiceLinkage
<a call to ZwQuerySystemInformation from a kernel mode driver>

Thursday, March 24, 2016

What happens with outstanding IRPs when a process terminates.

When Windows kernel terminates a process it inserts APC in each thread, in turn this APC calls PspExitThread that calls IoCancelThreadIo to cancel if possible all outstanding IRPs by calling IoCancelIrp and waits for IRP cancelation or completion. A thread waits only for IRPs that have been associated with a thread by calling IopQueueThreadIrp that adds IRP in a list of IRPs associated with a thread, the list head is IrpList field of the ETHREAD structure.

A thread will be blocked until any of the two conditions takes place
 - IrpList becomes empty, that means all outstanding IRPs completed in a normal way or were cancelled
 - 5 minutes timeout expired, in that case IopDisassociateThreadIrp is called to perform IRPs disassociation by removing them from IrpList and setting IRP->Tail.Overlay.Thread to NULL

Below is a call stack for a terminating thread with four outstanding IRPs ( marked yellow ).

        THREAD 870788e8  Cid 1100.0f5c  Teb: 7ffab000 Win32Thread: 00000000 WAIT: (DelayExecution) KernelMode Non-Alertable
            86d29460  SynchronizationEvent
        IRP List:
            87305dc8: (0006,0100) Flags: 00060a00  Mdl: 00000000
            86ecd6f8: (0006,0100) Flags: 00060a00  Mdl: 00000000
            862fd7b8: (0006,0100) Flags: 00060a00  Mdl: 00000000
            87221d80: (0006,0100) Flags: 00060a00  Mdl: 00000000
        Not impersonating
        DeviceMap                 975b0820
        Owning Process            8514a030       Image:         explorer.exe
        Attached Process          N/A            Image:         N/A
        Wait Start TickCount      22339          Ticks: 1 (0:00:00:00.015)
        Context Switch Count      2734           IdealProcessor: 1          
        UserTime                  00:00:00.000
        KernelTime                00:00:00.031
        Win32 Start Address 0x769842ed
        Stack Init a86bfed0 Current a86bfa38 Base a86c0000 Limit a86bd000 Call 0
        Priority 10 BasePriority 8 UnusualBoost 0 ForegroundBoost 2 IoPriority 2 PagePriority 2
        ChildEBP RetAddr
        a86bfa50 82ad269d nt!KiSwapContext+0x26 (FPO: [Uses EBP] [0,0,4])
        a86bfa88 82ad14f7 nt!KiSwapThread+0x266
        a86bfab0 82ad11d5 nt!KiCommitThreadWait+0x1df
        a86bfb0c 82cb9171 nt!KeDelayExecutionThread+0x2aa
        a86bfb40 82cbe519 nt!IoCancelThreadIo+0x70
        a86bfbb4 82cd2051 nt!PspExitThread+0x48e
        a86bfbcc 82b058c0 nt!PsExitSpecialApc+0x22
        a86bfc1c 82a922a4 nt!KiDeliverApc+0x28b
        a86bfc1c 778770b4 nt!KiServiceExit+0x64 (FPO: [0,3] TrapFrame @ a86bfc34)
        074fe3e4 00000000 ntdll!KiFastSystemCallRet (FPO: [0,0,0])

A main process thread waits for child threads termination.

        THREAD 86e307f0  Cid 1100.1104  Teb: 7ffdf000 Win32Thread: fe9c8a88 WAIT: (Executive) KernelMode Non-Alertable
            870788e8  Thread
        Not impersonating
        DeviceMap                 975b0820
        Owning Process            8514a030       Image:         explorer.exe
        Attached Process          N/A            Image:         N/A
        Wait Start TickCount      21665          Ticks: 675 (0:00:00:10.530)
        Context Switch Count      15565          IdealProcessor: 1          
        UserTime                  00:00:00.249
        KernelTime                00:00:00.530
        Win32 Start Address 0x00e50efa
        Stack Init a87b5ed0 Current a87b5a38 Base a87b6000 Limit a87b3000 Call 4f0
        Priority 12 BasePriority 8 UnusualBoost 0 ForegroundBoost 2 IoPriority 2 PagePriority 5

        ChildEBP RetAddr
        a87b5a50 82ad269d nt!KiSwapContext+0x26 (FPO: [Uses EBP] [0,0,4])
        a87b5a88 82ad14f7 nt!KiSwapThread+0x266
        a87b5ab0 82acb0cf nt!KiCommitThreadWait+0x1df
        a87b5b2c 82cbe28e nt!KeWaitForSingleObject+0x393
        a87b5bb4 82cd2051 nt!PspExitThread+0x203
        a87b5bcc 82b058c0 nt!PsExitSpecialApc+0x22
        a87b5c1c 82a922a4 nt!KiDeliverApc+0x28b
        a87b5c1c 77876fc0 nt!KiServiceExit+0x64 (FPO: [0,3] TrapFrame @ a87b5c34)
        000ffb18 00000000 ntdll!KiUserCallbackDispatcher (FPO: [0,0,0])

Tuesday, March 1, 2016

A case of successful registration of an incorrectly defined file system minifilter.

 An interesting observation. If you forget to terminate the FLT_OPERATION_REGISTRATION array with IRP_MJ_OPERATION_END then no instances will be attached, but the minifilter is successfully registered and the InstanceSetup callback is called. No error is reported. Just yet another case of a closed source Microsoft subsystem with inconsistent behavior, where you can spend hours chasing a bug by a trial and error approach instead of looking at the source code.

Friday, February 5, 2016

Unsafe use of FltGetFileNameInformationUnsafe

  The name FltGetFileNameInformationUnsafe is spot on. It is a very unsafe and dangerous function, but not in the way mentioned in the WDK. The documentation misses yet another dangerous case in which it should not be used. This case is related to file system isolation filters (for more information on such filters follow this link https://www.osronline.com/article.cfm?article=560 ). I would state this additional condition as

  - A filter driver must not use FltGetFileNameInformationUnsafe for file objects it never observed. A filter has observed a file object if it was called at least once with this file object for any operation and it has neither received IRP_MJ_CLOSE for this file object nor observed a failed IRP_MJ_CREATE for it. This condition guarantees that a file object is initialized by a filter or file system driver that is beneath the filter calling FltGetFileNameInformationUnsafe.

  When the above condition is violated, a filter might call an underlying file system with a file object that this file system never initialized, because the file object was initialized by a file system filter above the filter that called FltGetFileNameInformationUnsafe. At first glance it seems that in such a case the upper filter must isolate the underlying filters from such a file object. This is true, but there is a degenerate case in which a filter can gain access to file objects that it should never see: the process creation callback. A file object for the executable file is provided as a parameter to the callback, and this file object might have been initialized by a file system filter above the one that registered the callback. Below is a call stack from a system where the Microsoft Windows Defender file system minifilter used FltGetFileNameInformationUnsafe from a process creation callback to query a file name for a file object initialized by a file system filter attached to MiniFilter Manager ( aka fltmgr.sys )


Ntfs!NtfsCommonQueryInformation+0xa7
Ntfs!NtfsFsdDispatchSwitch+0xd3
Ntfs!NtfsFsdDispatchWait+0x47
nt!IovCallDriver+0x3cd
fltmgr!FltpQueryInformationFile+0x10e
fltmgr!QueryStandardLinkInformation+0x4d
fltmgr!FltpSetStreamListStandardInformationFlags+0x7a
fltmgr!FltpGetFileNameInformation+0x796
fltmgr!FltGetFileNameInformationUnsafe+0x71
WdFilter!MpGetImageNormalizedName+0x51
WdFilter!MpCreateProcessNotifyRoutineEx+0x77
nt!PspInsertThread+0x7a7
nt!NtCreateUserProcess+0x806
nt!KiSystemServiceCopyEnd+0x13

As a result, NTFS crashed when it received an IRP with a file object initialized by a file system filter attached to the top of the driver stack. The obvious solution is to use the good old ObQueryNameString, as the kernel itself has done for the last 25 years, for example in NtQueryVirtualMemory when processing a memory region backed by a mapped file. In that case an IRP is created and sent from the very top of the driver stack, so an isolation filter has a chance to intercept this IRP and complete it.

Tuesday, January 5, 2016

On importance of FileObject->Vpb pointer

 An easily overlooked issue in FS (file system) and FS filter development for Windows is the Vpb pointer in the FILE_OBJECT structure, i.e. the Volume Parameter Block ( struct VPB ). It is the responsibility of the FS or FS filter ( in the case of a stacked FS, i.e. FSD-over-FSD ) to initialize this pointer to a correct value. It is especially important for FSD-over-FSD implementations, as some underlying FSs check the VPB->ReferenceCount value on their PnP path and both the upper FSD and the lower FSD share the same VPB pointer. The kernel references the Vpb in the IopParseDevice routine, which calls the IRP_MJ_CREATE dispatch routine, but doesn't set the Vpb pointer in the FileObject sent with the create IRP. It dereferences the Vpb in IopDeleteFile just after IRP_MJ_CLOSE completes, but IopDeleteFile fetches the pointer from FileObject->Vpb, which is set by the FS when processing IRP_MJ_CREATE. If the FS or FS filter fails to set the Vpb pointer, PnP safe removal would be impossible for some file systems, as Vpb->ReferenceCount never drops below the checked value.

 I encountered this problem when working on my own implementation of an FSD-over-FSD FS filter. The problem manifested itself on FAT32; NTFS was free of this issue.

 Below are two call stacks for 32 bit Windows 8, the first is for a case when Vpb->ReferenceCount is bumped and the second for a case when it is decremented.


nt!IopCheckVpbMounted+0x81
nt!IopParseDevice+0x48b
nt!ObpLookupObjectName+0x6ef
nt!ObOpenObjectByName+0x1e3
nt!IopCreateFile+0x372
nt!NtCreateFile+0x78
nt!KiSystemServiceCopyEnd+0x13
ntdll!NtCreateFile+0xa


nt!IopDecrementVpbRefCount+0x5b
nt!IopDeleteFile+0xf5
nt!ObpRemoveObjectRoutine+0x64
nt!ObfDereferenceObjectWithTag+0x8f
nt!NtClose+0x210
nt!KiSystemServiceCopyEnd+0x13


Tuesday, May 5, 2015

Getting an object type on Windows 10 Technical Preview Build 10074

   Windows 10 Technical Preview Build 10074 came with a surprise. A bit of history: Windows 7 introduced a new way of retrieving an object type by object address. The object type pointer Type in OBJECT_HEADER was replaced with TypeIndex, which is an index into ObTypeIndexTable; this saved 3 (on 32 bit) or 7 (on 64 bit) bytes compared to a pointer. Windows 10 Build 10074 added a new feature: the TypeIndex value is no longer an index but the result of a binary operation between an index into ObTypeIndexTable, the second lowest byte of the object header address and a value from ObHeaderCookie. The actual reason for this is not yet clear to me, but it looks like an attempt to reduce inter-CPU cache coherency traffic by spreading the ObTypeIndexTable to contain copies of the object types and multiplexing access based on the object address. The exported ObGetObjectType function can be used to retrieve an object type address. Let's take a look at ObGetObjectType.

nt!ObGetObjectType:
lea         rax,[rcx-30h]
movzx   ecx,byte ptr [rcx-18h]
shr        rax,8
movzx   eax,al
xor        rax,rcx
movzx   ecx,byte ptr [nt!ObHeaderCookie (fffff802`eae3d42c)]
xor        rax,rcx
lea         rcx,[nt!ObTypeIndexTable (fffff802`eae3d8e0)]
mov       rax,qword ptr [rcx+rax*8]
ret

which can be written in C as ( where XOR(a,b) is a^b )

/*
 * C rendering of the nt!ObGetObjectType disassembly: decodes the value
 * stored in OBJECT_HEADER.TypeIndex and returns the matching entry of
 * ObTypeIndexTable.
 *
 * Object - address of the object body; its header is located with
 *          GET_OBJECT_HEADER.
 *
 * Returns the ObTypeIndexTable entry selected by
 *     TypeIndex ^ (second lowest byte of the header address) ^ cookie byte.
 */
POBJECT_TYPE
ObGetObjectType( __in PVOID Object )
{
    POBJECT_HEADER   Header = GET_OBJECT_HEADER( Object );

    /*
     * A pointer cannot be shifted directly, so convert the header address
     * to an integer first, then keep bits 8..15 (shr rax,8 / movzx eax,al).
     */
    UCHAR    AddressByte = (UCHAR)( (ULONG_PTR)Header >> 8 );
    UCHAR    Index = XOR( Header->TypeIndex, AddressByte );

    /*
     * Only one byte of the cookie takes part in the decoding.
     * NOTE(review): written as if ObHeaderCookie names the cookie's address;
     * confirm against its actual declaration.
     */
    UCHAR    Cookie = *(PUCHAR)ObHeaderCookie;

    return  ObTypeIndexTable[ XOR(Index, Cookie) ];
}