Saturday, July 29, 2017

Windows developers' misconception about UNIX.

While reading forum on Windows file system development I ran into a common misconception among Windows developers regarding UNIX design.
The essential difference between how the NT kernel works and how Unix was
designed is that NT caches streams of data (above the file system), whereas
on Unix data is cached at the block layer.
I spent 5 minutes to bust it.
This is true only for ancient *NIX kernels. Modern kernels use the same technique as NT with caching backed by file mapping structures.
For example below is a call stack from my test machine running the Linux kernel (4.12.2) when ext4 read operation (ext4_file_read_iter) called the "Linux cache manager" ( do_generic_file_read -> page_cache_sync_readahead ) to bring data in the cache backed by mapped file structures( struct address_space ) when processing the read() system call.
This resulted in a recursive call to mapping->a_ops->readpages into a file system's ext4_readpages . This is an analogue of a cached read in NT. Mac OS X uses the same caching by file mapping technique borrowed from BSD.
(gdb) bt
#0  ext4_readpages (file=0xffff88001d59b300, mapping=0xffff88001d1d56c0, pages=0xffffc90000817c30, nr_pages=1) at ../fs/ext4/inode.c:3308
#1  0xffffffff811b6288 in read_pages (gfp=<optimised out>, nr_pages=<optimised out>, pages=<optimised out>, filp=<optimised out>, mapping=<optimised out>) at ../mm/readahead.c:121
#2  __do_page_cache_readahead (mapping=<optimised out>, filp=<optimised out>, offset=1, nr_to_read=<optimised out>, lookahead_size=<optimised out>) at ../mm/readahead.c:199
#3  0xffffffff811b64b8 in ra_submit (ra=<optimised out>, ra=<optimised out>, ra=<optimised out>, filp=<optimised out>, mapping=<optimised out>) at ../mm/internal.h:66
#4  ondemand_readahead (mapping=0xffff88001d1d56c0, ra=0xffff88001d59b398, filp=0xffff88001d59b300, hit_readahead_marker=<optimised out>, offset=0, req_size=<optimised out>) at ../mm/readahead.c:478
#5  0xffffffff811b678e in page_cache_sync_readahead (mapping=<optimised out>, ra=<optimised out>, filp=<optimised out>, offset=<optimised out>, req_size=<optimised out>) at ../mm/readahead.c:510
#6  0xffffffff811a7a62 in do_generic_file_read (written=<optimised out>, iter=<optimised out>, ppos=<optimised out>, filp=<optimised out>) at ../mm/filemap.c:1813
#7  generic_file_read_iter (iocb=0x20000, iter=<optimised out>) at ../mm/filemap.c:2069
#8  0xffffffff812d1386 in ext4_file_read_iter (iocb=0xffff88001d59b300, to=0xffff88001d1d56c0) at ../fs/ext4/file.c:70
#9  0xffffffff81237680 in call_read_iter (file=<optimised out>, iter=<optimised out>, kio=<optimised out>) at ../include/linux/fs.h:1728
#10 new_sync_read (ppos=<optimised out>, len=<optimised out>, buf=<optimised out>, filp=<optimised out>) at ../fs/read_write.c:440
#11 __vfs_read (file=0xffff88001d59b300, buf=<optimised out>, count=<optimised out>, pos=0xffffc90000817f18) at ../fs/read_write.c:452
#12 0xffffffff81237cc3 in vfs_read (file=0xffff88001d59b300, buf=0x7fb92a0cb000 <error: Cannot access memory at address 0x7fb92a0cb000>, count=<optimised out>, pos=0xffffc90000817f18)
    at ../fs/read_write.c:473
#13 0xffffffff81239385 in SYSC_read (count=<optimised out>, buf=<optimised out>, fd=<optimised out>) at ../fs/read_write.c:589
#14 SyS_read (fd=<optimised out>, buf=140433251151872, count=131072) at ../fs/read_write.c:582
#15 0xffffffff818aaffb in entry_SYSCALL_64 () at ../arch/x86/entry/entry_64.S:203

(gdb) f 4
#4  ondemand_readahead (mapping=0xffff88001d1d56c0, ra=0xffff88001d59b398, filp=0xffff88001d59b300, hit_readahead_marker=<optimised out>, offset=0, req_size=<optimised out>) at ../mm/readahead.c:478
478  return ra_submit(ra, mapping, filp);

(gdb) p/x *mapping
$14 = {host = 0xffff88001d1d5548, page_tree = {gfp_mask = 0x1180020, rnode = 0x0}, tree_lock = {{rlock = {raw_lock = {val = {counter = 0x0}}}}}, i_mmap_writable = {counter = 0x0}, i_mmap = {
    rb_node = 0x0}, i_mmap_rwsem = {count = {counter = 0x0}, wait_list = {next = 0xffff88001d1d56f0, prev = 0xffff88001d1d56f0}, wait_lock = {raw_lock = {val = {counter = 0x0}}}, osq = {tail = {
        counter = 0x0}}, owner = 0x0}, nrpages = 0x0, nrexceptional = 0x0, writeback_index = 0x0, a_ops = 0xffffffff81a3a680, flags = 0x0, private_lock = {{rlock = {raw_lock = {val = {
            counter = 0x0}}}}}, gfp_mask = 0x14200ca, private_list = {next = 0xffff88001d1d5740, prev = 0xffff88001d1d5740}, private_data = 0x0}
(gdb) ptype mapping
type = struct address_space {
    struct inode *host;
    struct radix_tree_root page_tree;
    spinlock_t tree_lock;
    atomic_t i_mmap_writable;
    struct rb_root i_mmap;
    struct rw_semaphore i_mmap_rwsem;
    unsigned long nrpages;
    unsigned long nrexceptional;
    unsigned long writeback_index;
    const struct address_space_operations *a_ops;
    unsigned long flags;
    spinlock_t private_lock;
    gfp_t gfp_mask;
    struct list_head private_list;
    void *private_data;
} *

(gdb) f 1
#1  0xffffffff811b6288 in read_pages (gfp=<optimised out>, nr_pages=<optimised out>, pages=<optimised out>, filp=<optimised out>, mapping=<optimised out>) at ../mm/readahead.c:121
121   ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
(gdb) l
116  int ret;
118  blk_start_plug(&plug);
120  if (mapping->a_ops->readpages) {
121   ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
122   /* Clean up the remaining pages */
123   put_pages_list(pages);
124   goto out;
125  }

(gdb) f 9
#9  0xffffffff81237680 in call_read_iter (file=<optimised out>, iter=<optimised out>, kio=<optimised out>) at ../include/linux/fs.h:1728
1728  return file->f_op->read_iter(kio, iter);
(gdb) l
1723 } ____cacheline_aligned;
1725 static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
1726          struct iov_iter *iter)
1727 {
1728  return file->f_op->read_iter(kio, iter);
1729 }
1731 static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
1732           struct iov_iter *iter)

Monday, July 3, 2017

FltCreateFile and top device.

FltCreateFile calls IoCreateFileEx with IO_DRIVER_CREATE_CONTEXT.DeviceObjectHint pointing to the Filter Manager's filter object and then calls the lower registered filters. That allows the created file object to have TopDeviceObjectHint pointing to the Filter Manager's object.
 # Child-SP          RetAddr           Call Site
00 ffffe101`3fffebf8 fffff801`92125200 FLTMGR!FltpCreate
01 ffffe101`3fffec00 fffff801`9213058b nt!IopParseDevice+0x7f0
02 ffffe101`3fffedd0 fffff801`921340c0 nt!ObpLookupObjectName+0x46b
03 ffffe101`3fffefa0 fffff801`9213803a nt!ObOpenObjectByNameEx+0x1e0
04 ffffe101`3ffff0e0 fffff801`920b0eb4 nt!IopCreateFile+0x3aa
05 ffffe101`3ffff180 fffff808`485240d5 nt!IoCreateFileEx+0x124
06 ffffe101`3ffff210 fffff808`4853d32d FLTMGR!FltpCreateFile+0x1cd
07 ffffe101`3ffff310 fffff808`4b6a79f8 FLTMGR!FltCreateFile+0x8d
08 ffffe101`3ffff3a0 fffff808`484f4b4c avscan!AvPreCreate+0x378 [d:\work\avscan\filter\avscan.c @ 2106]
09 ffffe101`3ffff4b0 fffff808`484f46ec FLTMGR!FltpPerformPreCallbacks+0x2ec
0a ffffe101`3ffff5d0 fffff808`48526117 FLTMGR!FltpPassThroughInternal+0x8c
0b ffffe101`3ffff600 fffff801`92125200 FLTMGR!FltpCreate+0x2d7
0c ffffe101`3ffff6b0 fffff801`9213058b nt!IopParseDevice+0x7f0
0d ffffe101`3ffff880 fffff801`921340c0 nt!ObpLookupObjectName+0x46b
0e ffffe101`3ffffa50 fffff801`920c9e90 nt!ObOpenObjectByNameEx+0x1e0

0: kd> dt nt!_FILE_OBJECT ffff948c621a3330
   +0x000 Type             : 0n5
   +0x002 Size             : 0n216
   +0x008 DeviceObject     : 0xffff948c`60a3bc80 _DEVICE_OBJECT
   +0x010 Vpb              : 0xffff948c`60a556e0 _VPB
   +0x018 FsContext        : 0xffff948c`6111a740 Void
   +0x020 FsContext2       : 0xffff8483`76cf78f0 Void
   +0x028 SectionObjectPointer : (null) 
   +0x030 PrivateCacheMap  : (null) 
   +0x038 FinalStatus      : 0n0
   +0x040 RelatedFileObject : (null) 
   +0x048 LockOperation    : 0 ''
   +0x049 DeletePending    : 0 ''
   +0x04a ReadAccess       : 0x1 ''
   +0x04b WriteAccess      : 0 ''
   +0x04c DeleteAccess     : 0 ''
   +0x04d SharedRead       : 0x1 ''
   +0x04e SharedWrite      : 0x1 ''
   +0x04f SharedDelete     : 0x1 ''
   +0x050 Flags            : 0x40000
   +0x058 FileName         : _UNICODE_STRING "\"
   +0x068 CurrentByteOffset : _LARGE_INTEGER 0x0
   +0x070 Waiters          : 0
   +0x074 Busy             : 0
   +0x078 LastLock         : (null) 
   +0x080 Lock             : _KEVENT
   +0x098 Event            : _KEVENT
   +0x0b0 CompletionContext : (null) 
   +0x0b8 IrpListLock      : 0
   +0x0c0 IrpList          : _LIST_ENTRY [ 0xffff948c`621a33f0 - 0xffff948c`621a33f0 ]
   +0x0d0 FileObjectExtension : 0xffff948c`6226f1b0 Void
0: kd> dq 0xffff948c`6226f1b0
ffff948c`6226f1b0  00000000`00000000 00000000`00000000
ffff948c`6226f1c0  ffff948c`60dff0d0 00000000`00000000
ffff948c`6226f1d0  ffff948c`6243f2c0 00000000`00000000
ffff948c`6226f1e0  00000000`00000000 00000000`00000000
ffff948c`6226f1f0  00000000`00000000 00000000`00000000
ffff948c`6226f200  61436d4d`02120006 00000000`0000034c
ffff948c`6226f210  ffff8483`7535ed10 ffff948c`62161e28
ffff948c`6226f220  ffff948c`6221da78 00000000`00000000

0: kd> dq ffff948c`60dff0d0
ffff948c`60dff0d0  ffff948c`610734a0 00000000`00000000
ffff948c`60dff0e0  00000000`00000000 00000000`00000000
ffff948c`60dff0f0  65536d4d`02060003 6c8da38a`069a7123
ffff948c`60dff100  00000000`00000000 0000024e`49c8000a
ffff948c`60dff110  0000024e`49c80fff 00000000`00000000
ffff948c`60dff120  00000000`00000000 00000000`00000000
ffff948c`60dff130  00000000`00000000 00000000`00000000
ffff948c`60dff140  00000000`00000002 00000000`00000000

0: kd> !object ffff948c`610734a0
Object: ffff948c610734a0  Type: (ffff948c5e34eb00) Device
    ObjectHeader: ffff948c61073470 (new version)
    HandleCount: 0  PointerCount: 1
0: kd> !devstack ffff948c610734a0
  !DevObj           !DrvObj            !DevExt           ObjectName
> ffff948c610734a0  \FileSystem\FltMgr ffff948c610735f0  
  ffff948c61048060  \FileSystem\fastfatffff948c610481b0 
0: kd> !vpb 0xffff948c`60a556e0
Vpb at 0xffff948c60a556e0
Flags: 0x1 mounted 
DeviceObject: 0xffff948c61048060
RealDevice:   0xffff948c60a3bc80
RefCount: 8
Volume Label: