25 files changed, 1245 insertions, 574 deletions
diff --git a/programs/Makefile b/programs/Makefile
index 8641d0ee48a0..f77e1b7f10f8 100644
--- a/programs/Makefile
+++ b/programs/Makefile
@@ -1,5 +1,5 @@
 # ################################################################
-# Copyright (c) 2015-2020, Yann Collet, Facebook, Inc.
+# Copyright (c) Yann Collet, Facebook, Inc.
 # All rights reserved.
 #
 # This source code is licensed under both the BSD-style license (found in the
@@ -9,7 +9,7 @@
 # ##########################################################################
 # zstd : Command Line Utility, supporting gzip-like arguments
 # zstd32 : Same as zstd, but forced to compile in 32-bits mode
-# zstd_nolegacy : zstd without support of decompression of legacy versions
+# zstd-nolegacy : zstd without support of decompression of legacy versions
 # zstd-small : minimal zstd without dictionary builder and benchmark
 # zstd-compress : compressor-only version of zstd
 # zstd-decompress : decompressor-only version of zstd
@@ -18,31 +18,9 @@
 .PHONY: default
 default: zstd-release
 
-# silent mode by default; verbose can be triggered by V=1 or VERBOSE=1
-$(V)$(VERBOSE).SILENT:
+LIBZSTD := ../lib
 
-
-ZSTDDIR := ../lib
-
-# Version numbers
-LIBVER_SRC := $(ZSTDDIR)/zstd.h
-LIBVER_MAJOR_SCRIPT:=`sed -n '/define ZSTD_VERSION_MAJOR/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < $(LIBVER_SRC)`
-LIBVER_MINOR_SCRIPT:=`sed -n '/define ZSTD_VERSION_MINOR/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < $(LIBVER_SRC)`
-LIBVER_PATCH_SCRIPT:=`sed -n '/define ZSTD_VERSION_RELEASE/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < $(LIBVER_SRC)`
-LIBVER_SCRIPT:= $(LIBVER_MAJOR_SCRIPT).$(LIBVER_MINOR_SCRIPT).$(LIBVER_PATCH_SCRIPT)
-LIBVER_MAJOR := $(shell echo $(LIBVER_MAJOR_SCRIPT))
-LIBVER_MINOR := $(shell echo $(LIBVER_MINOR_SCRIPT))
-LIBVER_PATCH := $(shell echo $(LIBVER_PATCH_SCRIPT))
-LIBVER  := $(shell echo $(LIBVER_SCRIPT))
-
-ZSTD_VERSION = $(LIBVER)
-
-HAVE_COLORNEVER = $(shell echo a | grep --color=never a > /dev/null 2> /dev/null && echo 1 || echo 0)
-GREP_OPTIONS ?=
-ifeq ($HAVE_COLORNEVER, 1)
-  GREP_OPTIONS += --color=never
-endif
-GREP = grep $(GREP_OPTIONS)
+include $(LIBZSTD)/libzstd.mk
 
 ifeq ($(shell $(CC) -v 2>&1 | $(GREP) -c "gcc version "), 1)
   ALIGN_LOOP = -falign-loops=32
@@ -50,71 +28,25 @@ else
   ALIGN_LOOP =
 endif
 
-DEBUGLEVEL ?= 0
-CPPFLAGS += -DXXH_NAMESPACE=ZSTD_ -DDEBUGLEVEL=$(DEBUGLEVEL)
-ifeq ($(OS),Windows_NT)   # MinGW assumed
-  CPPFLAGS += -D__USE_MINGW_ANSI_STDIO   # compatibility with %zu formatting
-endif
-CFLAGS   ?= -O3
-DEBUGFLAGS+=-Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
-            -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \
-            -Wstrict-prototypes -Wundef -Wpointer-arith \
-            -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \
-            -Wredundant-decls -Wmissing-prototypes -Wc++-compat
-CFLAGS   += $(DEBUGFLAGS) $(MOREFLAGS)
-FLAGS     = $(CPPFLAGS) $(CFLAGS) $(LDFLAGS)
-
-ZSTDLIB_COMMON := $(ZSTDDIR)/common
-ZSTDLIB_COMPRESS := $(ZSTDDIR)/compress
-ZSTDLIB_DECOMPRESS := $(ZSTDDIR)/decompress
-ZDICT_DIR := $(ZSTDDIR)/dictBuilder
-ZSTDLEGACY_DIR := $(ZSTDDIR)/legacy
-
-vpath %.c $(ZSTDLIB_COMMON) $(ZSTDLIB_COMPRESS) $(ZSTDLIB_DECOMPRESS) $(ZDICT_DIR) $(ZSTDLEGACY_DIR)
-
-ZSTDLIB_COMMON_C := $(wildcard $(ZSTDLIB_COMMON)/*.c)
-ZSTDLIB_COMPRESS_C := $(wildcard $(ZSTDLIB_COMPRESS)/*.c)
-ZSTDLIB_DECOMPRESS_C := $(wildcard $(ZSTDLIB_DECOMPRESS)/*.c)
-ZSTDLIB_CORE_SRC := $(ZSTDLIB_DECOMPRESS_C) $(ZSTDLIB_COMMON_C) $(ZSTDLIB_COMPRESS_C)
-ZDICT_SRC := $(wildcard $(ZDICT_DIR)/*.c)
-
-ZSTD_LEGACY_SUPPORT ?= 5
-ZSTDLEGACY_SRC :=
-ifneq ($(ZSTD_LEGACY_SUPPORT), 0)
-ifeq ($(shell test $(ZSTD_LEGACY_SUPPORT) -lt 8; echo $$?), 0)
-  ZSTDLEGACY_SRC += $(shell ls $(ZSTDLEGACY_DIR)/*.c | $(GREP) 'v0[$(ZSTD_LEGACY_SUPPORT)-7]')
-endif
-endif
+ZSTDLIB_COMMON_SRC := $(sort $(ZSTD_COMMON_FILES))
+ZSTDLIB_COMPRESS_SRC := $(sort $(ZSTD_COMPRESS_FILES))
+ZSTDLIB_DECOMPRESS_SRC := $(sort $(ZSTD_DECOMPRESS_FILES))
+ZSTDLIB_CORE_SRC := $(sort $(ZSTD_DECOMPRESS_FILES) $(ZSTD_COMMON_FILES) $(ZSTD_COMPRESS_FILES))
+ZDICT_SRC := $(sort $(ZSTD_DICTBUILDER_FILES))
+ZSTDLEGACY_SRC := $(sort $(ZSTD_LEGACY_FILES))
 
 # Sort files in alphabetical order for reproducible builds
 ZSTDLIB_FULL_SRC = $(sort $(ZSTDLIB_CORE_SRC) $(ZSTDLEGACY_SRC) $(ZDICT_SRC))
-ZSTDLIB_LOCAL_SRC := $(notdir $(ZSTDLIB_FULL_SRC))
-ZSTDLIB_LOCAL_OBJ := $(ZSTDLIB_LOCAL_SRC:.c=.o)
+ZSTDLIB_LOCAL_SRC = $(notdir $(ZSTDLIB_FULL_SRC))
+ZSTDLIB_LOCAL_OBJ0 := $(ZSTDLIB_LOCAL_SRC:.c=.o)
+ZSTDLIB_LOCAL_OBJ := $(ZSTDLIB_LOCAL_OBJ0:.S=.o)
 
-ZSTD_CLI_SRC := $(wildcard *.c)
+ZSTD_CLI_SRC := $(sort $(wildcard *.c))
 ZSTD_CLI_OBJ := $(ZSTD_CLI_SRC:.c=.o)
 
-ZSTD_ALL_SRC := $(ZSTDLIB_LOCAL_SRC) $(ZSTD_CLI_SRC)
-ZSTD_ALL_OBJ := $(ZSTD_ALL_SRC:.c=.o)
-
-UNAME := $(shell uname)
-ifeq ($(UNAME), Darwin)
-  HASH ?= md5
-else ifeq ($(UNAME), FreeBSD)
-  HASH ?= gmd5sum
-else ifeq ($(UNAME), OpenBSD)
-  HASH ?= md5
-endif
-HASH ?= md5sum
-HAVE_HASH :=$(shell echo 1 | $(HASH) > /dev/null && echo 1 || echo 0)
-
-ifndef BUILD_DIR
-HASH_DIR = conf_$(shell echo $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) $(ZSTD_FILES) | $(HASH) | cut -f 1 -d " ")
-ifeq ($(HAVE_HASH),0)
-  $(info warning : could not find HASH ($(HASH)), needed to differentiate builds using different flags)
-  BUILD_DIR := obj/generic_noconf
-endif
-endif # BUILD_DIR
+ZSTD_ALL_SRC = $(ZSTDLIB_LOCAL_SRC) $(ZSTD_CLI_SRC)
+ZSTD_ALL_OBJ0 := $(ZSTD_ALL_SRC:.c=.o)
+ZSTD_ALL_OBJ := $(ZSTD_ALL_OBJ0:.S=.o)
 
 # Define *.exe as extension for Windows systems
 ifneq (,$(filter Windows%,$(OS)))
@@ -130,11 +62,6 @@ else
   EXT =
 endif
 
-VOID = /dev/null
-
-# Make 4.3 doesn't support '\#' anymore (https://lwn.net/Articles/810071/)
-NUM_SYMBOL := \#
-
 # thread detection
 NO_THREAD_MSG := ==> no threads, building without multithreading support
 HAVE_PTHREAD := $(shell printf '$(NUM_SYMBOL)include <pthread.h>\nint main(void) { return 0; }' > have_pthread.c && $(CC) $(FLAGS) -o have_pthread$(EXT) have_pthread.c -pthread 2> $(VOID) && rm have_pthread$(EXT) && echo 1 || echo 0; rm have_pthread.c)
@@ -149,7 +76,7 @@ endif
 
 # zlib detection
 NO_ZLIB_MSG := ==> no zlib, building zstd without .gz support
-HAVE_ZLIB := $(shell printf '$(NUM_SYMBOL)include <zlib.h>\nint main(void) { return 0; }' > have_zlib.c && $(CC) $(FLAGS) -o have_zlib$(EXT) have_zlib.c -lz 2> $(VOID) && rm have_zlib$(EXT) && echo 1 || echo 0; rm have_zlib.c)
+HAVE_ZLIB ?= $(shell printf '$(NUM_SYMBOL)include <zlib.h>\nint main(void) { return 0; }' > have_zlib.c && $(CC) $(FLAGS) -o have_zlib$(EXT) have_zlib.c -lz 2> $(VOID) && rm have_zlib$(EXT) && echo 1 || echo 0; rm have_zlib.c)
 ifeq ($(HAVE_ZLIB), 1)
   ZLIB_MSG := ==> building zstd with .gz compression support
   ZLIBCPP = -DZSTD_GZCOMPRESS -DZSTD_GZDECOMPRESS
@@ -160,7 +87,7 @@ endif
 
 # lzma detection
 NO_LZMA_MSG := ==> no liblzma, building zstd without .xz/.lzma support
-HAVE_LZMA := $(shell printf '$(NUM_SYMBOL)include <lzma.h>\nint main(void) { return 0; }' > have_lzma.c && $(CC) $(FLAGS) -o have_lzma$(EXT) have_lzma.c -llzma 2> $(VOID) && rm have_lzma$(EXT) && echo 1 || echo 0; rm have_lzma.c)
+HAVE_LZMA ?= $(shell printf '$(NUM_SYMBOL)include <lzma.h>\nint main(void) { return 0; }' > have_lzma.c && $(CC) $(FLAGS) -o have_lzma$(EXT) have_lzma.c -llzma 2> $(VOID) && rm have_lzma$(EXT) && echo 1 || echo 0; rm have_lzma.c)
 ifeq ($(HAVE_LZMA), 1)
   LZMA_MSG := ==> building zstd with .xz/.lzma compression support
   LZMACPP = -DZSTD_LZMACOMPRESS -DZSTD_LZMADECOMPRESS
@@ -171,7 +98,7 @@ endif
 
 # lz4 detection
 NO_LZ4_MSG := ==> no liblz4, building zstd without .lz4 support
-HAVE_LZ4 := $(shell printf '$(NUM_SYMBOL)include <lz4frame.h>\n$(NUM_SYMBOL)include <lz4.h>\nint main(void) { return 0; }' > have_lz4.c && $(CC) $(FLAGS) -o have_lz4$(EXT) have_lz4.c -llz4 2> $(VOID) && rm have_lz4$(EXT) && echo 1 || echo 0; rm have_lz4.c)
+HAVE_LZ4 ?= $(shell printf '$(NUM_SYMBOL)include <lz4frame.h>\n$(NUM_SYMBOL)include <lz4.h>\nint main(void) { return 0; }' > have_lz4.c && $(CC) $(FLAGS) -o have_lz4$(EXT) have_lz4.c -llz4 2> $(VOID) && rm have_lz4$(EXT) && echo 1 || echo 0; rm have_lz4.c)
 ifeq ($(HAVE_LZ4), 1)
   LZ4_MSG := ==> building zstd with .lz4 compression support
   LZ4CPP = -DZSTD_LZ4COMPRESS -DZSTD_LZ4DECOMPRESS
@@ -192,22 +119,25 @@ endif
 endif
 
 SET_CACHE_DIRECTORY = \
-	$(MAKE) --no-print-directory $@ \
+   +$(MAKE) --no-print-directory $@ \
     BUILD_DIR=obj/$(HASH_DIR) \
     CPPFLAGS="$(CPPFLAGS)" \
     CFLAGS="$(CFLAGS)" \
-    LDFLAGS="$(LDFLAGS)"
+    LDFLAGS="$(LDFLAGS)" \
+    LDLIBS="$(LDLIBS)" \
+    ZSTD_ALL_SRC="$(ZSTD_ALL_SRC)"
 
 
 .PHONY: all
 all: zstd
 
 .PHONY: allVariants
-allVariants: zstd zstd-compress zstd-decompress zstd-small zstd-nolegacy zstd-dictBuilder
+allVariants: zstd zstd-compress zstd-decompress zstd-small zstd-frugal zstd-nolegacy zstd-dictBuilder
 
 .PHONY: zstd  # must always be run
 zstd : CPPFLAGS += $(THREAD_CPP) $(ZLIBCPP) $(LZMACPP) $(LZ4CPP)
-zstd : LDFLAGS += $(THREAD_LD) $(ZLIBLD) $(LZMALD) $(LZ4LD) $(DEBUGFLAGS_LD)
+zstd : LDFLAGS += $(THREAD_LD) $(DEBUGFLAGS_LD)
+zstd : LDLIBS += $(ZLIBLD) $(LZMALD) $(LZ4LD)
 zstd : CPPFLAGS += -DZSTD_LEGACY_SUPPORT=$(ZSTD_LEGACY_SUPPORT)
 ifneq (,$(filter Windows%,$(OS)))
 zstd : $(RES_FILE)
@@ -229,11 +159,11 @@ $(BUILD_DIR)/zstd : $(ZSTD_OBJ)
 	@echo "$(LZMA_MSG)"
 	@echo "$(LZ4_MSG)"
 	@echo LINK $@
-	$(CC) $(FLAGS) $^ -o $@$(EXT) $(LDFLAGS)
+	$(CC) $(FLAGS) $^ $(LDLIBS) -o $@$(EXT)
 
 ifeq ($(HAVE_HASH),1)
-SRCBIN_HASH = $(shell cat $(BUILD_DIR)/zstd 2> $(VOID) | $(HASH) | cut -f 1 -d " ")
-DSTBIN_HASH = $(shell cat zstd 2> $(VOID) | $(HASH) | cut -f 1 -d " ")
+SRCBIN_HASH = $(shell cat $(BUILD_DIR)/zstd$(EXT) 2> $(VOID) | $(HASH) | cut -f 1 -d " ")
+DSTBIN_HASH = $(shell cat zstd$(EXT) 2> $(VOID) | $(HASH) | cut -f 1 -d " ")
 BIN_ISDIFFERENT = $(if $(filter $(SRCBIN_HASH),$(DSTBIN_HASH)),0,1)
 else
 BIN_ISDIFFERENT = 1
@@ -241,7 +171,7 @@ endif
 
 zstd : $(BUILD_DIR)/zstd
 	if [ $(BIN_ISDIFFERENT) -eq 1 ]; then \
-		cp -f $< $@; \
+		cp -f $<$(EXT) $@$(EXT); \
 		echo zstd build completed; \
 	else \
 		echo zstd already built; \
@@ -266,39 +196,38 @@ zstd32 : $(ZSTDLIB_FULL_SRC) $(ZSTD_CLI_SRC)
 
 ## zstd-nolegacy: same scope as zstd, with just support of legacy formats removed
 zstd-nolegacy : LDFLAGS += $(THREAD_LD) $(ZLIBLD) $(LZMALD) $(LZ4LD) $(DEBUGFLAGS_LD)
+zstd-nolegacy : CPPFLAGS += -UZSTD_LEGACY_SUPPORT -DZSTD_LEGACY_SUPPORT=0
 zstd-nolegacy : $(ZSTDLIB_CORE_SRC) $(ZDICT_SRC) $(ZSTD_CLI_OBJ)
 	$(CC) $(FLAGS) $^ -o $@$(EXT) $(LDFLAGS)
 
+.PHONY: zstd-nomt
 zstd-nomt : THREAD_CPP :=
 zstd-nomt : THREAD_LD  :=
 zstd-nomt : THREAD_MSG := - multi-threading disabled
 zstd-nomt : zstd
 
+.PHONY: zstd-nogz
 zstd-nogz : ZLIBCPP :=
 zstd-nogz : ZLIBLD  :=
 zstd-nogz : ZLIB_MSG := - gzip support is disabled
 zstd-nogz : zstd
 
+.PHONY: zstd-noxz
 zstd-noxz : LZMACPP :=
 zstd-noxz : LZMALD  :=
 zstd-noxz : LZMA_MSG := - xz/lzma support is disabled
 zstd-noxz : zstd
 
-## zstd-dll: zstd executable linked to dynamic library libzstd (must already exist)
-# note : the following target doesn't link
-#        because zstd uses non-public symbols from libzstd
-#        such as XXH64 (for benchmark),
-#        ZDICT_trainFromBuffer_unsafe_legacy (for dictionary builder)
-#        and ZSTD_cycleLog (likely for --patch-from).
-#        It's unclear at this stage if this is a scenario that must be supported
+## zstd-dll: zstd executable linked to dynamic library libzstd (must have same version)
 .PHONY: zstd-dll
-zstd-dll : LDFLAGS+= -L$(ZSTDDIR) -lzstd
-zstd-dll : ZSTDLIB_FULL_SRC =
-zstd-dll : $(ZSTD_CLI_OBJ)
-	$(CC) $(FLAGS) $^ -o $@$(EXT) $(LDFLAGS)
+zstd-dll : LDFLAGS+= -L$(LIBZSTD)
+zstd-dll : LDLIBS += -lzstd
+zstd-dll : ZSTDLIB_LOCAL_SRC = xxhash.c
+zstd-dll : zstd
 
 
 ## zstd-pgo: zstd executable optimized with PGO.
+.PHONY: zstd-pgo
 zstd-pgo :
 	$(MAKE) clean
 	$(MAKE) zstd MOREFLAGS=-fprofile-generate
@@ -315,18 +244,17 @@ zstd-pgo :
 ## zstd-small: minimal target, supporting only zstd compression and decompression. no bench. no legacy. no other format.
 zstd-small: CFLAGS = -Os -s
 zstd-frugal zstd-small: $(ZSTDLIB_CORE_SRC) zstdcli.c util.c timefn.c fileio.c
-	$(CC) $(FLAGS) -DZSTD_NOBENCH -DZSTD_NODICT $^ -o $@$(EXT)
+	$(CC) $(FLAGS) -DZSTD_NOBENCH -DZSTD_NODICT -DZSTD_NOTRACE -UZSTD_LEGACY_SUPPORT -DZSTD_LEGACY_SUPPORT=0 $^ -o $@$(EXT)
 
-zstd-decompress: $(ZSTDLIB_COMMON_C) $(ZSTDLIB_DECOMPRESS_C) zstdcli.c util.c timefn.c fileio.c
-	$(CC) $(FLAGS) -DZSTD_NOBENCH -DZSTD_NODICT -DZSTD_NOCOMPRESS $^ -o $@$(EXT)
+zstd-decompress: $(ZSTDLIB_COMMON_SRC) $(ZSTDLIB_DECOMPRESS_SRC) zstdcli.c util.c timefn.c fileio.c
+	$(CC) $(FLAGS) -DZSTD_NOBENCH -DZSTD_NODICT -DZSTD_NOCOMPRESS -DZSTD_NOTRACE -UZSTD_LEGACY_SUPPORT -DZSTD_LEGACY_SUPPORT=0 $^ -o $@$(EXT)
 
-zstd-compress: $(ZSTDLIB_COMMON_C) $(ZSTDLIB_COMPRESS_C) zstdcli.c util.c timefn.c fileio.c
-	$(CC) $(FLAGS) -DZSTD_NOBENCH -DZSTD_NODICT -DZSTD_NODECOMPRESS $^ -o $@$(EXT)
+zstd-compress: $(ZSTDLIB_COMMON_SRC) $(ZSTDLIB_COMPRESS_SRC) zstdcli.c util.c timefn.c fileio.c
+	$(CC) $(FLAGS) -DZSTD_NOBENCH -DZSTD_NODICT -DZSTD_NODECOMPRESS -DZSTD_NOTRACE -UZSTD_LEGACY_SUPPORT -DZSTD_LEGACY_SUPPORT=0 $^ -o $@$(EXT)
 
 ## zstd-dictBuilder: executable supporting dictionary creation and compression (only)
-zstd-dictBuilder: CPPFLAGS += -DZSTD_NOBENCH -DZSTD_NODECOMPRESS
-zstd-dictBuilder: $(ZSTDLIB_COMMON_C) $(ZSTDLIB_COMPRESS_C) $(ZDICT_SRC) zstdcli.c util.c timefn.c fileio.c dibio.c
-	$(CC) $(FLAGS) $^ -o $@$(EXT)
+zstd-dictBuilder: $(ZSTDLIB_COMMON_SRC) $(ZSTDLIB_COMPRESS_SRC) $(ZDICT_SRC) zstdcli.c util.c timefn.c fileio.c dibio.c
+	$(CC) $(FLAGS) -DZSTD_NOBENCH -DZSTD_NODECOMPRESS -DZSTD_NOTRACE $^ -o $@$(EXT)
 
 zstdmt: zstd
 	ln -sf zstd zstdmt
@@ -346,9 +274,11 @@ endif
 .PHONY: clean
 clean:
 	$(RM) core *.o tmp* result* *.gcda dictionary *.zst \
-        zstd$(EXT) zstd32$(EXT) zstd-compress$(EXT) zstd-decompress$(EXT) \
+        zstd$(EXT) zstd32$(EXT) zstd-dll$(EXT) \
+        zstd-compress$(EXT) zstd-decompress$(EXT) \
         zstd-small$(EXT) zstd-frugal$(EXT) zstd-nolegacy$(EXT) zstd4$(EXT) \
-        zstd-dictBuilder$(EXT) *.gcda default*.profraw default.profdata have_zlib$(EXT)
+        zstd-dictBuilder$(EXT) \
+        *.gcda default*.profraw default.profdata have_zlib$(EXT)
 	$(RM) -r obj/*
 	@echo Cleaning completed
 
@@ -388,6 +318,10 @@ $(BUILD_DIR)/%.o : %.c $(BUILD_DIR)/%.d | $(BUILD_DIR)
 	@echo CC $@
 	$(COMPILE.c) $(DEPFLAGS) $(BUILD_DIR)/$*.d $(OUTPUT_OPTION) $<
 
+$(BUILD_DIR)/%.o : %.S | $(BUILD_DIR)
+	@echo AS $@
+	$(COMPILE.S) $(OUTPUT_OPTION) $<
+
 MKDIR ?= mkdir
 $(BUILD_DIR): ; $(MKDIR) -p $@
 
@@ -401,7 +335,7 @@ include $(wildcard $(DEPFILES))
 #-----------------------------------------------------------------------------
 # make install is validated only for Linux, macOS, BSD, Hurd and Solaris targets
 #-----------------------------------------------------------------------------
-ifneq (,$(filter $(UNAME),Linux Darwin GNU/kFreeBSD GNU OpenBSD FreeBSD NetBSD DragonFly SunOS Haiku))
+ifneq (,$(filter $(UNAME),Linux Darwin GNU/kFreeBSD GNU OpenBSD FreeBSD NetBSD DragonFly SunOS Haiku AIX))
 
 HAVE_COLORNEVER = $(shell echo a | egrep --color=never a > /dev/null 2> /dev/null && echo 1 || echo 0)
 EGREP_OPTIONS ?=
diff --git a/programs/README.md b/programs/README.md
index cf7f5ba46d12..5570f90c3b4f 100644
--- a/programs/README.md
+++ b/programs/README.md
@@ -156,7 +156,7 @@ Arguments :
 
 Advanced arguments :
  -V     : display Version number and exit
- -c     : force write to standard output, even if it is the console
+ -c     : write to standard output (even if it is the console)
  -v     : verbose mode; specify multiple times to increase verbosity
  -q     : suppress warnings; specify twice to suppress errors too
 --no-progress : do not display the progress counter
@@ -172,6 +172,7 @@ Advanced compression arguments :
 --long[=#]: enable long distance matching with given window log (default: 27)
 --fast[=#]: switch to very fast compression levels (default: 1)
 --adapt : dynamically adapt compression level to I/O conditions
+--patch-from=FILE : specify the file to be used as a reference point for zstd's diff engine
  -T#    : spawns # compression threads (default: 1, 0==# cores)
  -B#    : select size of each job (default: 0==automatic)
 --single-thread : use a single thread for both I/O and compression (result slightly different than -T1)
@@ -224,7 +225,8 @@ Therefore, this avenue is intentionally restricted and only supports `ZSTD_CLEVE
 that `zstd` will use for compression, which by default is `1`.
 This functionality only exists when `zstd` is compiled with multithread support.
 `0` means "use as many threads as detected cpu cores on local system".
-The max # of threads is capped at: `ZSTDMT_NBWORKERS_MAX==200`.
+The max # of threads is capped at `ZSTDMT_NBWORKERS_MAX`,
+which is either 64 in 32-bit mode, or 256 for 64-bit environments.
 
 This functionality can be useful when `zstd` CLI is invoked in a way that doesn't allow passing arguments.
 One such scenario is `tar --zstd`.
diff --git a/programs/benchfn.c b/programs/benchfn.c
index ed7273afb6e5..1aadbdd9136c 100644
--- a/programs/benchfn.c
+++ b/programs/benchfn.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/programs/benchfn.h b/programs/benchfn.h
index e555bbe6ae32..590f292eaa61 100644
--- a/programs/benchfn.h
+++ b/programs/benchfn.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/programs/benchzstd.c b/programs/benchzstd.c
index 77056203d55e..fa2659efbbba 100644
--- a/programs/benchzstd.c
+++ b/programs/benchzstd.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -31,12 +31,17 @@
 #include "timefn.h"      /* UTIL_time_t */
 #include "benchfn.h"
 #include "../lib/common/mem.h"
+#ifndef ZSTD_STATIC_LINKING_ONLY
 #define ZSTD_STATIC_LINKING_ONLY
+#endif
 #include "../lib/zstd.h"
 #include "datagen.h"     /* RDG_genBuffer */
+#ifndef XXH_INLINE_ALL
+#define XXH_INLINE_ALL
+#endif
 #include "../lib/common/xxhash.h"
 #include "benchzstd.h"
-#include "../lib/common/zstd_errors.h"
+#include "../lib/zstd_errors.h"
 
 
 /* *************************************
@@ -67,17 +72,11 @@ static const size_t maxMemory = (sizeof(size_t)==4)  ?
 /* *************************************
 *  console display
 ***************************************/
-#define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
+#define DISPLAY(...)         { fprintf(stderr, __VA_ARGS__); fflush(NULL); }
 #define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
 /* 0 : no display;   1: errors;   2 : + result + interaction + warnings;   3 : + progression;   4 : + information */
-
-static const U64 g_refreshRate = SEC_TO_MICRO / 6;
-static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
-
-#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
-            if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
-            { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
-            if (displayLevel>=4) fflush(stderr); } } }
+#define OUTPUT(...)          { fprintf(stdout, __VA_ARGS__); fflush(NULL); }
+#define OUTPUTLEVEL(l, ...)  if (displayLevel>=l) { OUTPUT(__VA_ARGS__); }
 
 
 /* *************************************
@@ -137,7 +136,8 @@ BMK_advancedParams_t BMK_initAdvancedParams(void) {
         0, /* ldmHashLog */
         0, /* ldmBuckSizeLog */
         0,  /* ldmHashRateLog */
-        ZSTD_lcm_auto /* literalCompressionMode */
+        ZSTD_ps_auto, /* literalCompressionMode */
+        0 /* useRowMatchFinder */
     };
     return res;
 }
@@ -175,6 +175,7 @@ BMK_initCCtx(ZSTD_CCtx* ctx,
         CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_nbWorkers, adv->nbWorkers));
     }
     CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_compressionLevel, cLevel));
+    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_useRowMatchFinder, adv->useRowMatchFinder));
     CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_enableLongDistanceMatching, adv->ldmFlag));
     CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_ldmMinMatch, adv->ldmMinMatch));
     CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_ldmHashLog, adv->ldmHashLog));
@@ -187,7 +188,7 @@ BMK_initCCtx(ZSTD_CCtx* ctx,
     CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_minMatch, (int)comprParams->minMatch));
     CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_targetLength, (int)comprParams->targetLength));
     CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_literalCompressionMode, (int)adv->literalCompressionMode));
-    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_strategy, comprParams->strategy));
+    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_strategy, (int)comprParams->strategy));
     CHECK_Z(ZSTD_CCtx_loadDictionary(ctx, dictBuffer, dictBufferSize));
 }
 
@@ -377,10 +378,7 @@ BMK_benchMemAdvancedNoAlloc(
                 if (adv->mode == BMK_decodeOnly) {
                     cSizes[nbBlocks] = thisBlockSize;
                     benchResult.cSize = thisBlockSize;
-                }
-            }
-        }
-    }
+    }   }   }   }
 
     /* warming up `compressedBuffer` */
     if (adv->mode == BMK_decodeOnly) {
@@ -435,8 +433,9 @@ BMK_benchMemAdvancedNoAlloc(
         dctxprep.dictBuffer = dictBuffer;
         dctxprep.dictBufferSize = dictBufferSize;
 
-        DISPLAYLEVEL(2, "\r%70s\r", "");   /* blank line */
-        DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->\r", marks[markNb], displayName, (unsigned)srcSize);
+        OUTPUTLEVEL(2, "\r%70s\r", "");   /* blank line */
+        assert(srcSize < UINT_MAX);
+        OUTPUTLEVEL(2, "%2s-%-17.17s :%10u -> \r", marks[markNb], displayName, (unsigned)srcSize);
 
         while (!(compressionCompleted && decompressionCompleted)) {
             if (!compressionCompleted) {
@@ -448,7 +447,7 @@ BMK_benchMemAdvancedNoAlloc(
 
                 {   BMK_runTime_t const cResult = BMK_extract_runTime(cOutcome);
                     cSize = cResult.sumOfReturn;
-                    ratio = (double)srcSize / cSize;
+                    ratio = (double)srcSize / (double)cSize;
                     {   BMK_benchResult_t newResult;
                         newResult.cSpeed = (U64)((double)srcSize * TIMELOOP_NANOSEC / cResult.nanoSecPerRun);
                         benchResult.cSize = cSize;
@@ -457,11 +456,12 @@ BMK_benchMemAdvancedNoAlloc(
                 }   }
 
                 {   int const ratioAccuracy = (ratio < 10.) ? 3 : 2;
-                    DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->%10u (%5.*f),%6.*f MB/s\r",
+                    assert(cSize < UINT_MAX);
+                    OUTPUTLEVEL(2, "%2s-%-17.17s :%10u ->%10u (x%5.*f), %6.*f MB/s \r",
                             marks[markNb], displayName,
                             (unsigned)srcSize, (unsigned)cSize,
                             ratioAccuracy, ratio,
-                            benchResult.cSpeed < (10 MB) ? 2 : 1, (double)benchResult.cSpeed / MB_UNIT);
+                            benchResult.cSpeed < (10 * MB_UNIT) ? 2 : 1, (double)benchResult.cSpeed / MB_UNIT);
                 }
                 compressionCompleted = BMK_isCompleted_TimedFn(timeStateCompress);
             }
@@ -480,11 +480,11 @@ BMK_benchMemAdvancedNoAlloc(
                 }
 
                 {   int const ratioAccuracy = (ratio < 10.) ? 3 : 2;
-                    DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->%10u (%5.*f),%6.*f MB/s ,%6.1f MB/s \r",
+                    OUTPUTLEVEL(2, "%2s-%-17.17s :%10u ->%10u (x%5.*f), %6.*f MB/s, %6.1f MB/s\r",
                             marks[markNb], displayName,
                             (unsigned)srcSize, (unsigned)cSize,
                             ratioAccuracy, ratio,
-                            benchResult.cSpeed < (10 MB) ? 2 : 1, (double)benchResult.cSpeed / MB_UNIT,
+                            benchResult.cSpeed < (10 * MB_UNIT) ? 2 : 1, (double)benchResult.cSpeed / MB_UNIT,
                             (double)benchResult.dSpeed / MB_UNIT);
                 }
                 decompressionCompleted = BMK_isCompleted_TimedFn(timeStateDecompress);
@@ -521,7 +521,7 @@ BMK_benchMemAdvancedNoAlloc(
                                 DISPLAY("%02X ", ((const BYTE*)srcBuffer)[u+n]);
                             DISPLAY(" \n");
                             DISPLAY("decode: ");
-                            for (n=lowest; n>0; n++)
+                            for (n=lowest; n>0; n--)
                                 DISPLAY("%02X ", resultBuffer[u-n]);
                             DISPLAY(" :%02X:  ", resultBuffer[u]);
                             for (n=1; n<3; n++)
@@ -541,13 +541,13 @@ BMK_benchMemAdvancedNoAlloc(
             double const cSpeed = (double)benchResult.cSpeed / MB_UNIT;
             double const dSpeed = (double)benchResult.dSpeed / MB_UNIT;
             if (adv->additionalParam) {
-                DISPLAY("-%-3i%11i (%5.3f) %6.2f MB/s %6.1f MB/s  %s (param=%d)\n", cLevel, (int)cSize, ratio, cSpeed, dSpeed, displayName, adv->additionalParam);
+                OUTPUT("-%-3i%11i (%5.3f) %6.2f MB/s %6.1f MB/s  %s (param=%d)\n", cLevel, (int)cSize, ratio, cSpeed, dSpeed, displayName, adv->additionalParam);
             } else {
-                DISPLAY("-%-3i%11i (%5.3f) %6.2f MB/s %6.1f MB/s  %s\n", cLevel, (int)cSize, ratio, cSpeed, dSpeed, displayName);
+                OUTPUT("-%-3i%11i (%5.3f) %6.2f MB/s %6.1f MB/s  %s\n", cLevel, (int)cSize, ratio, cSpeed, dSpeed, displayName);
             }
         }
 
-        DISPLAYLEVEL(2, "%2i#\n", cLevel);
+        OUTPUTLEVEL(2, "%2i#\n", cLevel);
     }   /* Bench */
 
     benchResult.cMem = (1ULL << (comprParams->windowLog)) + ZSTD_sizeof_CCtx(cctx);
@@ -676,7 +676,7 @@ static BMK_benchOutcome_t BMK_benchCLevel(const void* srcBuffer, size_t benchedS
     }
 
     if (displayLevel == 1 && !adv->additionalParam)   /* --quiet mode */
-        DISPLAY("bench %s %s: input %u bytes, %u seconds, %u KB blocks\n",
+        OUTPUT("bench %s %s: input %u bytes, %u seconds, %u KB blocks\n",
                 ZSTD_VERSION_STRING, ZSTD_GIT_COMMIT_STRING,
                 (unsigned)benchedSize, adv->nbSeconds, (unsigned)(adv->blockSize>>10));
 
@@ -766,7 +766,7 @@ static int BMK_loadFiles(void* buffer, size_t bufferSize,
         }
         {   FILE* const f = fopen(fileNamesTable[n], "rb");
             if (f==NULL) RETURN_ERROR_INT(10, "impossible to open file %s", fileNamesTable[n]);
-            DISPLAYUPDATE(2, "Loading %s...       \r", fileNamesTable[n]);
+            OUTPUTLEVEL(2, "Loading %s...       \r", fileNamesTable[n]);
             if (fileSize > bufferSize-pos) fileSize = bufferSize-pos, nbFiles=n;   /* buffer too small - stop after this file */
             {   size_t const readSize = fread(((char*)buffer)+pos, 1, (size_t)fileSize, f);
                 if (readSize != (size_t)fileSize) RETURN_ERROR_INT(11, "could not read %s", fileNamesTable[n]);
@@ -803,6 +803,10 @@ BMK_benchOutcome_t BMK_benchFilesAdvanced(
         RETURN_ERROR(15, BMK_benchOutcome_t, "Invalid Compression Level");
     }
 
+    if (totalSizeToLoad == UTIL_FILESIZE_UNKNOWN) {
+        RETURN_ERROR(9, BMK_benchOutcome_t, "Error loading files");
+    }
+
     fileSizes = (size_t*)calloc(nbFiles, sizeof(size_t));
     if (!fileSizes) RETURN_ERROR(12, BMK_benchOutcome_t, "not enough memory for fileSizes");
 
diff --git a/programs/benchzstd.h b/programs/benchzstd.h
index 8c55b3c4f297..11ac85da7f94 100644
--- a/programs/benchzstd.h
+++ b/programs/benchzstd.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -116,7 +116,8 @@ typedef struct {
     int ldmHashLog;
     int ldmBucketSizeLog;
     int ldmHashRateLog;
-    ZSTD_literalCompressionMode_e literalCompressionMode;
+    ZSTD_paramSwitch_e literalCompressionMode;
+    int useRowMatchFinder;  /* use row-based matchfinder if possible */
 } BMK_advancedParams_t;
 
 /* returns default parameters used by nonAdvanced functions */
diff --git a/programs/datagen.c b/programs/datagen.c
index 4353b7ff9943..3b4f9e5c7b69 100644
--- a/programs/datagen.c
+++ b/programs/datagen.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/programs/datagen.h b/programs/datagen.h
index 5a2682d8f9f6..b76ae2a2225c 100644
--- a/programs/datagen.h
+++ b/programs/datagen.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/programs/dibio.c b/programs/dibio.c
index cb3829e3e59b..d19f954486f0 100644
--- a/programs/dibio.c
+++ b/programs/dibio.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -31,7 +31,6 @@
 
 #include "timefn.h"         /* UTIL_time_t, UTIL_clockSpanMicro, UTIL_getTime */
 #include "../lib/common/mem.h"  /* read */
-#include "../lib/common/error_private.h"
 #include "dibio.h"
 
 
@@ -49,6 +48,7 @@
 static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
 
 #define NOISELENGTH 32
+#define MAX_SAMPLES_SIZE (2 GB) /* training dataset limited to 2GB */
 
 
 /*-*************************************
@@ -88,6 +88,15 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
 #undef MIN
 #define MIN(a,b)    ((a) < (b) ? (a) : (b))
 
+/**
+  Returns the size of a file.
+  If error returns -1.
+*/
+static S64 DiB_getFileSize (const char * fileName)
+{
+    U64 const fileSize = UTIL_getFileSize(fileName);
+    return (fileSize == UTIL_FILESIZE_UNKNOWN) ? -1 : (S64)fileSize;
+}
 
 /* ********************************************************
 *  File related operations
@@ -101,47 +110,67 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
  * *bufferSizePtr is modified, it provides the amount data loaded within buffer.
  *  sampleSizes is filled with the size of each sample.
  */
-static unsigned DiB_loadFiles(void* buffer, size_t* bufferSizePtr,
-                              size_t* sampleSizes, unsigned sstSize,
-                              const char** fileNamesTable, unsigned nbFiles, size_t targetChunkSize,
-                              unsigned displayLevel)
+static int DiB_loadFiles(
+    void* buffer, size_t* bufferSizePtr,
+    size_t* sampleSizes, int sstSize,
+    const char** fileNamesTable, int nbFiles,
+    size_t targetChunkSize, int displayLevel )
 {
     char* const buff = (char*)buffer;
-    size_t pos = 0;
-    unsigned nbLoadedChunks = 0, fileIndex;
-
-    for (fileIndex=0; fileIndex<nbFiles; fileIndex++) {
-        const char* const fileName = fileNamesTable[fileIndex];
-        unsigned long long const fs64 = UTIL_getFileSize(fileName);
-        unsigned long long remainingToLoad = (fs64 == UTIL_FILESIZE_UNKNOWN) ? 0 : fs64;
-        U32 const nbChunks = targetChunkSize ? (U32)((fs64 + (targetChunkSize-1)) / targetChunkSize) : 1;
-        U64 const chunkSize = targetChunkSize ? MIN(targetChunkSize, fs64) : fs64;
-        size_t const maxChunkSize = (size_t)MIN(chunkSize, SAMPLESIZE_MAX);
-        U32 cnb;
-        FILE* const f = fopen(fileName, "rb");
-        if (f==NULL) EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileName, strerror(errno));
-        DISPLAYUPDATE(2, "Loading %s...       \r", fileName);
-        for (cnb=0; cnb<nbChunks; cnb++) {
-            size_t const toLoad = (size_t)MIN(maxChunkSize, remainingToLoad);
-            if (toLoad > *bufferSizePtr-pos) break;
-            {   size_t const readSize = fread(buff+pos, 1, toLoad, f);
-                if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName);
-                pos += readSize;
-                sampleSizes[nbLoadedChunks++] = toLoad;
-                remainingToLoad -= targetChunkSize;
-                if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */
-                    fileIndex = nbFiles;  /* stop there */
+    size_t totalDataLoaded = 0;
+    int nbSamplesLoaded = 0;
+    int fileIndex = 0;
+    FILE * f = NULL;
+
+    assert(targetChunkSize <= SAMPLESIZE_MAX);
+
+    while ( nbSamplesLoaded < sstSize && fileIndex < nbFiles ) {
+        size_t fileDataLoaded;
+        S64 const fileSize = DiB_getFileSize(fileNamesTable[fileIndex]);
+        if (fileSize <= 0) /* skip if zero-size or file error */
+            continue;
+
+        f = fopen( fileNamesTable[fileIndex], "rb");
+        if (f == NULL)
+            EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileNamesTable[fileIndex], strerror(errno));
+        DISPLAYUPDATE(2, "Loading %s...       \r", fileNamesTable[fileIndex]);
+
+        /* Load the first chunk of data from the file */
+        fileDataLoaded = targetChunkSize > 0 ?
+                            (size_t)MIN(fileSize, (S64)targetChunkSize) :
+                            (size_t)MIN(fileSize, SAMPLESIZE_MAX );
+        if (totalDataLoaded + fileDataLoaded > *bufferSizePtr)
+            break;
+        if (fread( buff+totalDataLoaded, 1, fileDataLoaded, f ) != fileDataLoaded)
+            EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
+        sampleSizes[nbSamplesLoaded++] = fileDataLoaded;
+        totalDataLoaded += fileDataLoaded;
+
+        /* If file-chunking is enabled, load the rest of the file as more samples */
+        if (targetChunkSize > 0) {
+            while( (S64)fileDataLoaded < fileSize && nbSamplesLoaded < sstSize ) {
+                size_t const chunkSize = MIN((size_t)(fileSize-fileDataLoaded), targetChunkSize);
+                if (totalDataLoaded + chunkSize > *bufferSizePtr) /* buffer is full */
                     break;
-                }
-                if (toLoad < targetChunkSize) {
-                    fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR);
-        }   }   }
-        fclose(f);
+
+                if (fread( buff+totalDataLoaded, 1, chunkSize, f ) != chunkSize)
+                    EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
+                sampleSizes[nbSamplesLoaded++] = chunkSize;
+                totalDataLoaded += chunkSize;
+                fileDataLoaded += chunkSize;
+            }
+        }
+        fileIndex += 1;
+        fclose(f); f = NULL;
     }
+    if (f != NULL)
+        fclose(f);
+
     DISPLAYLEVEL(2, "\r%79s\r", "");
-    *bufferSizePtr = pos;
-    DISPLAYLEVEL(4, "loaded : %u KB \n", (unsigned)(pos >> 10))
-    return nbLoadedChunks;
+    DISPLAYLEVEL(4, "Loaded %d KB total training data, %d nb samples \n",
+        (int)(totalDataLoaded / (1 KB)), nbSamplesLoaded );
+    *bufferSizePtr = totalDataLoaded;
+    return nbSamplesLoaded;
 }
 
 #define DiB_rotl32(x,r) ((x << r) | (x >> (32 - r)))
@@ -223,11 +252,10 @@ static void DiB_saveDict(const char* dictFileName,
       if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) }
 }
 
-
 typedef struct {
-    U64 totalSizeToLoad;
-    unsigned oneSampleTooLarge;
-    unsigned nbSamples;
+    S64 totalSizeToLoad;
+    int nbSamples;
+    int oneSampleTooLarge;
 } fileStats;
 
 /*! DiB_fileStats() :
@@ -235,58 +263,92 @@ typedef struct {
  *  provides the amount of data to be loaded and the resulting nb of samples.
  *  This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
  */
-static fileStats DiB_fileStats(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned displayLevel)
+static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t chunkSize, int displayLevel)
 {
     fileStats fs;
-    unsigned n;
+    int n;
     memset(&fs, 0, sizeof(fs));
+
+    // We assume that if chunking is requested, the chunk size is < SAMPLESIZE_MAX
+    assert( chunkSize <= SAMPLESIZE_MAX );
+
     for (n=0; n<nbFiles; n++) {
-        U64 const fileSize = UTIL_getFileSize(fileNamesTable[n]);
-        U64 const srcSize = (fileSize == UTIL_FILESIZE_UNKNOWN) ? 0 : fileSize;
-        U32 const nbSamples = (U32)(chunkSize ? (srcSize + (chunkSize-1)) / chunkSize : 1);
-        U64 const chunkToLoad = chunkSize ? MIN(chunkSize, srcSize) : srcSize;
-        size_t const cappedChunkSize = (size_t)MIN(chunkToLoad, SAMPLESIZE_MAX);
-        fs.totalSizeToLoad += cappedChunkSize * nbSamples;
-        fs.oneSampleTooLarge |= (chunkSize > 2*SAMPLESIZE_MAX);
-        fs.nbSamples += nbSamples;
+      S64 const fileSize = DiB_getFileSize(fileNamesTable[n]);
+      // TODO: is there a minimum sample size? What if the file is 1-byte?
+      if (fileSize == 0) {
+        DISPLAYLEVEL(3, "Sample file '%s' has zero size, skipping...\n", fileNamesTable[n]);
+        continue;
+      }
+
+      /* the case where we are breaking up files in sample chunks */
+      if (chunkSize > 0)
+      {
+        // TODO: is there a minimum sample size? Can we have a 1-byte sample?
+        fs.nbSamples += (int)((fileSize + chunkSize-1) / chunkSize);
+        fs.totalSizeToLoad += fileSize;
+      }
+      else {
+      /* the case where one file is one sample */
+        if (fileSize > SAMPLESIZE_MAX) {
+          /* flag excessively large sample files */
+          fs.oneSampleTooLarge |= (fileSize > 2*SAMPLESIZE_MAX);
+
+          /* Limit to the first SAMPLESIZE_MAX (128kB) of the file */
+          DISPLAYLEVEL(3, "Sample file '%s' is too large, limiting to %d KB",
+              fileNamesTable[n], SAMPLESIZE_MAX / (1 KB));
+        }
+        fs.nbSamples += 1;
+        fs.totalSizeToLoad += MIN(fileSize, SAMPLESIZE_MAX);
+      }
     }
-    DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (unsigned)(fs.totalSizeToLoad >> 10));
+    DISPLAYLEVEL(4, "Found training data %d files, %d KB, %d samples\n", nbFiles, (int)(fs.totalSizeToLoad / (1 KB)), fs.nbSamples);
     return fs;
 }
 
-
-/*! ZDICT_trainFromBuffer_unsafe_legacy() :
-    Strictly Internal use only !!
-    Same as ZDICT_trainFromBuffer_legacy(), but does not control `samplesBuffer`.
-    `samplesBuffer` must be followed by noisy guard band to avoid out-of-buffer reads.
-    @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
-              or an error code.
-*/
-size_t ZDICT_trainFromBuffer_unsafe_legacy(void* dictBuffer, size_t dictBufferCapacity,
-                                           const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
-                                           ZDICT_legacy_params_t parameters);
-
-
-int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
-                       const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
+int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
+                       const char** fileNamesTable, int nbFiles, size_t chunkSize,
                        ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
-                       ZDICT_fastCover_params_t* fastCoverParams, int optimize)
+                       ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit)
 {
-    unsigned const displayLevel = params ? params->zParams.notificationLevel :
-                        coverParams ? coverParams->zParams.notificationLevel :
-                        fastCoverParams ? fastCoverParams->zParams.notificationLevel :
-                        0;   /* should never happen */
+    fileStats fs;
+    size_t* sampleSizes; /* vector of sample sizes. Each sample can be up to SAMPLESIZE_MAX */
+    int nbSamplesLoaded; /* nb of samples effectively loaded in srcBuffer */
+    size_t loadedSize; /* total data loaded in srcBuffer for all samples */
+    void* srcBuffer /* contiguous buffer with training data/samples */;
     void* const dictBuffer = malloc(maxDictSize);
-    fileStats const fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
-    size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
-    size_t const memMult = params ? MEMMULT :
-                           coverParams ? COVER_MEMMULT:
-                           FASTCOVER_MEMMULT;
-    size_t const maxMem =  DiB_findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
-    size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad);
-    void* const srcBuffer = malloc(loadedSize+NOISELENGTH);
     int result = 0;
 
+    int const displayLevel = params ? params->zParams.notificationLevel :
+        coverParams ? coverParams->zParams.notificationLevel :
+        fastCoverParams ? fastCoverParams->zParams.notificationLevel : 0;
+
+    /* Shuffle input files before we start assessing how much sample datA to load.
+       The purpose of the shuffle is to pick random samples when the sample
+       set is larger than what we can load in memory. */
+    DISPLAYLEVEL(3, "Shuffling input files\n");
+    DiB_shuffle(fileNamesTable, nbFiles);
+
+    /* Figure out how much sample data to load with how many samples */
+    fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
+
+    {
+        int const memMult = params ? MEMMULT :
+                            coverParams ? COVER_MEMMULT:
+                            FASTCOVER_MEMMULT;
+        size_t const maxMem =  DiB_findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
+        /* Limit the size of the training data to the free memory */
+        /* Limit the size of the training data to 2GB */
+        /* TODO: there is opportunity to stop DiB_fileStats() early when the data limit is reached */
+        loadedSize = (size_t)MIN( MIN((S64)maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE );
+        if (memLimit != 0) {
+            DISPLAYLEVEL(2, "!  Warning : setting manual memory limit for dictionary training data at %u MB \n",
+                (unsigned)(memLimit / (1 MB)));
+            loadedSize = (size_t)MIN(loadedSize, memLimit);
+        }
+        srcBuffer = malloc(loadedSize+NOISELENGTH);
+        sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
+    }
+
     /* Checks */
     if ((!sampleSizes) || (!srcBuffer) || (!dictBuffer))
         EXM_THROW(12, "not enough memory for DiB_trainFiles");   /* should not happen */
@@ -301,31 +363,32 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
         DISPLAYLEVEL(2, "!  Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
         EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
     }
-    if (fs.totalSizeToLoad < (unsigned long long)maxDictSize * 8) {
+    if (fs.totalSizeToLoad < (S64)maxDictSize * 8) {
         DISPLAYLEVEL(2, "!  Warning : data size of samples too small for target dictionary size \n");
         DISPLAYLEVEL(2, "!  Samples should be about 100x larger than target dictionary size \n");
     }
 
     /* init */
-    if (loadedSize < fs.totalSizeToLoad)
-        DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20));
+    if ((S64)loadedSize < fs.totalSizeToLoad)
+        DISPLAYLEVEL(1, "Training samples set too large (%u MB); training on %u MB only...\n",
+            (unsigned)(fs.totalSizeToLoad / (1 MB)),
+            (unsigned)(loadedSize / (1 MB)));
 
     /* Load input buffer */
-    DISPLAYLEVEL(3, "Shuffling input files\n");
-    DiB_shuffle(fileNamesTable, nbFiles);
-
-    DiB_loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel);
+    nbSamplesLoaded = DiB_loadFiles(
+        srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable,
+        nbFiles, chunkSize, displayLevel);
 
     {   size_t dictSize;
         if (params) {
             DiB_fillNoise((char*)srcBuffer + loadedSize, NOISELENGTH);   /* guard band, for end of buffer condition */
-            dictSize = ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, maxDictSize,
-                                                           srcBuffer, sampleSizes, fs.nbSamples,
-                                                           *params);
+            dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize,
+                                                    srcBuffer, sampleSizes, nbSamplesLoaded,
+                                                    *params);
         } else if (coverParams) {
             if (optimize) {
               dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
-                                                             srcBuffer, sampleSizes, fs.nbSamples,
+                                                             srcBuffer, sampleSizes, nbSamplesLoaded,
                                                              coverParams);
               if (!ZDICT_isError(dictSize)) {
                   unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
@@ -334,13 +397,13 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
               }
             } else {
               dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
-                                                     sampleSizes, fs.nbSamples, *coverParams);
+                                                     sampleSizes, nbSamplesLoaded, *coverParams);
             }
         } else {
             assert(fastCoverParams != NULL);
             if (optimize) {
               dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize,
-                                                              srcBuffer, sampleSizes, fs.nbSamples,
+                                                              srcBuffer, sampleSizes, nbSamplesLoaded,
                                                               fastCoverParams);
               if (!ZDICT_isError(dictSize)) {
                 unsigned splitPercentage = (unsigned)(fastCoverParams->splitPoint * 100);
@@ -350,7 +413,7 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
               }
             } else {
               dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, srcBuffer,
-                                                        sampleSizes, fs.nbSamples, *fastCoverParams);
+                                                        sampleSizes, nbSamplesLoaded, *fastCoverParams);
             }
         }
         if (ZDICT_isError(dictSize)) {
diff --git a/programs/dibio.h b/programs/dibio.h
index 682723d6a54b..666c1e661800 100644
--- a/programs/dibio.h
+++ b/programs/dibio.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -19,7 +19,7 @@
 *  Dependencies
 ***************************************/
 #define ZDICT_STATIC_LINKING_ONLY
-#include "../lib/dictBuilder/zdict.h"     /* ZDICT_params_t */
+#include "../lib/zdict.h"     /* ZDICT_params_t */
 
 
 /*-*************************************
@@ -31,9 +31,9 @@
     `parameters` is optional and can be provided with values set to 0, meaning "default".
     @return : 0 == ok. Any other : error.
 */
-int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
-                       const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
+int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
+                       const char** fileNamesTable, int nbFiles, size_t chunkSize,
                        ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
-                       ZDICT_fastCover_params_t* fastCoverParams, int optimize);
+                       ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit);
 
 #endif
diff --git a/programs/fileio.c b/programs/fileio.c
index 65f2d531a81d..5338fa62955b 100644
--- a/programs/fileio.c
+++ b/programs/fileio.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -25,9 +25,10 @@
 ***************************************/
 #include "platform.h"   /* Large Files support, SET_BINARY_MODE */
 #include "util.h"       /* UTIL_getFileSize, UTIL_isRegularFile, UTIL_isSameFile */
-#include <stdio.h>      /* fprintf, fopen, fread, _fileno, stdin, stdout */
+#include <stdio.h>      /* fprintf, open, fdopen, fread, _fileno, stdin, stdout */
 #include <stdlib.h>     /* malloc, free */
 #include <string.h>     /* strcmp, strlen */
+#include <fcntl.h>      /* O_WRONLY */
 #include <assert.h>
 #include <errno.h>      /* errno */
 #include <limits.h>     /* INT_MAX */
@@ -44,8 +45,7 @@
 
 #define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_magicNumber, ZSTD_frameHeaderSize_max */
 #include "../lib/zstd.h"
-#include "../lib/common/zstd_errors.h"  /* ZSTD_error_frameParameter_windowTooLarge */
-#include "../lib/compress/zstd_compress_internal.h"
+#include "../lib/zstd_errors.h"  /* ZSTD_error_frameParameter_windowTooLarge */
 
 #if defined(ZSTD_GZCOMPRESS) || defined(ZSTD_GZDECOMPRESS)
 #  include <zlib.h>
@@ -74,16 +74,29 @@
 
 #define FNSPACE 30
 
+/* Default file permissions 0666 (modulated by umask) */
+#if !defined(_WIN32)
+/* These macros aren't defined on windows. */
+#define DEFAULT_FILE_PERMISSIONS (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)
+#else
+#define DEFAULT_FILE_PERMISSIONS (0666)
+#endif
+
 /*-*************************************
 *  Macros
 ***************************************/
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+#undef MAX
+#define MAX(a,b) ((a)>(b) ? (a) : (b))
 
 struct FIO_display_prefs_s {
     int displayLevel;   /* 0 : no display;  1: errors;  2: + result + interaction + warnings;  3: + progression;  4: + information */
-    U32 noProgress;
+    FIO_progressSetting_e progressSetting;
 };
 
-static FIO_display_prefs_t g_display_prefs = {2, 0};
+static FIO_display_prefs_t g_display_prefs = {2, FIO_ps_auto};
 
 #define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
 #define DISPLAYOUT(...)      fprintf(stdout, __VA_ARGS__)
@@ -92,10 +105,10 @@ static FIO_display_prefs_t g_display_prefs = {2, 0};
 static const U64 g_refreshRate = SEC_TO_MICRO / 6;
 static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
 
-#define READY_FOR_UPDATE() (!g_display_prefs.noProgress && UTIL_clockSpanMicro(g_displayClock) > g_refreshRate)
+#define READY_FOR_UPDATE() ((g_display_prefs.progressSetting != FIO_ps_never) && UTIL_clockSpanMicro(g_displayClock) > g_refreshRate)
 #define DELAY_NEXT_UPDATE() { g_displayClock = UTIL_getTime(); }
 #define DISPLAYUPDATE(l, ...) {                              \
-        if (g_display_prefs.displayLevel>=l && !g_display_prefs.noProgress) { \
+        if (g_display_prefs.displayLevel>=l && (g_display_prefs.progressSetting != FIO_ps_never)) { \
             if (READY_FOR_UPDATE() || (g_display_prefs.displayLevel>=4)) { \
                 DELAY_NEXT_UPDATE();                         \
                 DISPLAY(__VA_ARGS__);                        \
@@ -294,6 +307,7 @@ struct FIO_prefs_s {
     int blockSize;
     int overlapLog;
     U32 adaptiveMode;
+    U32 useRowMatchFinder;
     int rsyncable;
     int minAdaptLevel;
     int maxAdaptLevel;
@@ -306,7 +320,7 @@ struct FIO_prefs_s {
     size_t targetCBlockSize;
     int srcSizeHint;
     int testMode;
-    ZSTD_literalCompressionMode_e literalCompressionMode;
+    ZSTD_paramSwitch_e literalCompressionMode;
 
     /* IO preferences */
     U32 removeSrcFile;
@@ -319,6 +333,7 @@ struct FIO_prefs_s {
     int excludeCompressedFiles;
     int patchFromMode;
     int contentSize;
+    int allowBlockDevices;
 };
 
 /*-*************************************
@@ -377,8 +392,9 @@ FIO_prefs_t* FIO_createPreferences(void)
     ret->targetCBlockSize = 0;
     ret->srcSizeHint = 0;
     ret->testMode = 0;
-    ret->literalCompressionMode = ZSTD_lcm_auto;
+    ret->literalCompressionMode = ZSTD_ps_auto;
     ret->excludeCompressedFiles = 0;
+    ret->allowBlockDevices = 0;
     return ret;
 }
 
@@ -414,7 +430,7 @@ void FIO_freeContext(FIO_ctx_t* const fCtx)
 
 void FIO_setNotificationLevel(int level) { g_display_prefs.displayLevel=level; }
 
-void FIO_setNoProgress(unsigned noProgress) { g_display_prefs.noProgress = noProgress; }
+void FIO_setProgressSetting(FIO_progressSetting_e setting) { g_display_prefs.progressSetting = setting; }
 
 
 /*-*************************************
@@ -446,6 +462,8 @@ void FIO_setNbWorkers(FIO_prefs_t* const prefs, int nbWorkers) {
 
 void FIO_setExcludeCompressedFile(FIO_prefs_t* const prefs, int excludeCompressedFiles) { prefs->excludeCompressedFiles = excludeCompressedFiles; }
 
+void FIO_setAllowBlockDevices(FIO_prefs_t* const prefs, int allowBlockDevices) { prefs->allowBlockDevices = allowBlockDevices; }
+
 void FIO_setBlockSize(FIO_prefs_t* const prefs, int blockSize) {
     if (blockSize && prefs->nbWorkers==0)
         DISPLAYLEVEL(2, "Setting block size is useless in single-thread mode \n");
@@ -464,6 +482,10 @@ void FIO_setAdaptiveMode(FIO_prefs_t* const prefs, unsigned adapt) {
     prefs->adaptiveMode = adapt;
 }
 
+void FIO_setUseRowMatchFinder(FIO_prefs_t* const prefs, int useRowMatchFinder) {
+    prefs->useRowMatchFinder = useRowMatchFinder;
+}
+
 void FIO_setRsyncable(FIO_prefs_t* const prefs, int rsyncable) {
     if ((rsyncable>0) && (prefs->nbWorkers==0))
         EXM_THROW(1, "Rsyncable mode is not compatible with single thread mode \n");
@@ -488,7 +510,7 @@ void FIO_setTestMode(FIO_prefs_t* const prefs, int testMode) {
 
 void FIO_setLiteralCompressionMode(
         FIO_prefs_t* const prefs,
-        ZSTD_literalCompressionMode_e mode) {
+        ZSTD_paramSwitch_e mode) {
     prefs->literalCompressionMode = mode;
 }
 
@@ -584,11 +606,12 @@ static int FIO_removeFile(const char* path)
 }
 
 /** FIO_openSrcFile() :
- *  condition : `srcFileName` must be non-NULL.
+ *  condition : `srcFileName` must be non-NULL. `prefs` may be NULL.
  * @result : FILE* to `srcFileName`, or NULL if it fails */
-static FILE* FIO_openSrcFile(const char* srcFileName)
+static FILE* FIO_openSrcFile(const FIO_prefs_t* const prefs, const char* srcFileName)
 {
     stat_t statbuf;
+    int allowBlockDevices = prefs != NULL ? prefs->allowBlockDevices : 0;
     assert(srcFileName != NULL);
     if (!strcmp (srcFileName, stdinmark)) {
         DISPLAYLEVEL(4,"Using stdin for input \n");
@@ -604,6 +627,7 @@ static FILE* FIO_openSrcFile(const char* srcFileName)
 
     if (!UTIL_isRegularFileStat(&statbuf)
      && !UTIL_isFIFOStat(&statbuf)
+     && !(allowBlockDevices && UTIL_isBlockDevStat(&statbuf))
     ) {
         DISPLAYLEVEL(1, "zstd: %s is not a regular file -- ignored \n",
                         srcFileName);
@@ -622,7 +646,8 @@ static FILE* FIO_openSrcFile(const char* srcFileName)
  * @result : FILE* to `dstFileName`, or NULL if it fails */
 static FILE*
 FIO_openDstFile(FIO_ctx_t* fCtx, FIO_prefs_t* const prefs,
-                const char* srcFileName, const char* dstFileName)
+                const char* srcFileName, const char* dstFileName,
+                const int mode)
 {
     if (prefs->testMode) return NULL;  /* do not open file in test mode */
 
@@ -649,7 +674,6 @@ FIO_openDstFile(FIO_ctx_t* fCtx, FIO_prefs_t* const prefs,
 
     if (UTIL_isRegularFile(dstFileName)) {
         /* Check if destination file already exists */
-        FILE* const fCheck = fopen( dstFileName, "rb" );
 #if !defined(_WIN32)
         /* this test does not work on Windows :
          * `NUL` and `nul` are detected as regular files */
@@ -658,31 +682,41 @@ FIO_openDstFile(FIO_ctx_t* fCtx, FIO_prefs_t* const prefs,
                         dstFileName);
         }
 #endif
-        if (fCheck != NULL) {  /* dst file exists, authorization prompt */
-            fclose(fCheck);
-            if (!prefs->overwrite) {
-                if (g_display_prefs.displayLevel <= 1) {
-                    /* No interaction possible */
-                    DISPLAY("zstd: %s already exists; not overwritten  \n",
-                            dstFileName);
-                    return NULL;
-                }
-                DISPLAY("zstd: %s already exists; ", dstFileName);
-                if (UTIL_requireUserConfirmation("overwrite (y/n) ? ", "Not overwritten  \n", "yY", fCtx->hasStdinInput))
-                    return NULL;
+        if (!prefs->overwrite) {
+            if (g_display_prefs.displayLevel <= 1) {
+                /* No interaction possible */
+                DISPLAY("zstd: %s already exists; not overwritten  \n",
+                        dstFileName);
+                return NULL;
             }
-            /* need to unlink */
-            FIO_removeFile(dstFileName);
-    }   }
+            DISPLAY("zstd: %s already exists; ", dstFileName);
+            if (UTIL_requireUserConfirmation("overwrite (y/n) ? ", "Not overwritten  \n", "yY", fCtx->hasStdinInput))
+                return NULL;
+        }
+        /* need to unlink */
+        FIO_removeFile(dstFileName);
+    }
 
-    {   FILE* const f = fopen( dstFileName, "wb" );
+    {
+#if defined(_WIN32)
+        /* Windows requires opening the file as a "binary" file to avoid
+         * mangling. This macro doesn't exist on unix. */
+        const int openflags = O_WRONLY|O_CREAT|O_TRUNC|O_BINARY;
+        const int fd = _open(dstFileName, openflags, mode);
+        FILE* f = NULL;
+        if (fd != -1) {
+            f = _fdopen(fd, "wb");
+        }
+#else
+        const int openflags = O_WRONLY|O_CREAT|O_TRUNC;
+        const int fd = open(dstFileName, openflags, mode);
+        FILE* f = NULL;
+        if (fd != -1) {
+            f = fdopen(fd, "wb");
+        }
+#endif
         if (f == NULL) {
             DISPLAYLEVEL(1, "zstd: %s: %s\n", dstFileName, strerror(errno));
-        } else if (srcFileName != NULL
-               && strcmp (srcFileName, stdinmark)
-               && strcmp(dstFileName, nulmark) ) {
-            /* reduce rights on newly created dst file while compression is ongoing */
-            UTIL_chmod(dstFileName, NULL, 00600);
         }
         return f;
     }
@@ -698,29 +732,43 @@ static size_t FIO_createDictBuffer(void** bufferPtr, const char* fileName, FIO_p
 {
     FILE* fileHandle;
     U64 fileSize;
+    stat_t statbuf;
 
     assert(bufferPtr != NULL);
     *bufferPtr = NULL;
     if (fileName == NULL) return 0;
 
     DISPLAYLEVEL(4,"Loading %s as dictionary \n", fileName);
+
+    if (!UTIL_stat(fileName, &statbuf)) {
+        EXM_THROW(31, "Stat failed on dictionary file %s: %s", fileName, strerror(errno));
+    }
+
+    if (!UTIL_isRegularFileStat(&statbuf)) {
+        EXM_THROW(32, "Dictionary %s must be a regular file.", fileName);
+    }
+
     fileHandle = fopen(fileName, "rb");
-    if (fileHandle==NULL) EXM_THROW(31, "%s: %s", fileName, strerror(errno));
 
-    fileSize = UTIL_getFileSize(fileName);
+    if (fileHandle == NULL) {
+        EXM_THROW(33, "Couldn't open dictionary %s: %s", fileName, strerror(errno));
+    }
+
+    fileSize = UTIL_getFileSizeStat(&statbuf);
     {
         size_t const dictSizeMax = prefs->patchFromMode ? prefs->memLimit : DICTSIZE_MAX;
         if (fileSize >  dictSizeMax) {
-            EXM_THROW(32, "Dictionary file %s is too large (> %u bytes)",
+            EXM_THROW(34, "Dictionary file %s is too large (> %u bytes)",
                             fileName,  (unsigned)dictSizeMax);   /* avoid extreme cases */
         }
     }
     *bufferPtr = malloc((size_t)fileSize);
     if (*bufferPtr==NULL) EXM_THROW(34, "%s", strerror(errno));
     {   size_t const readSize = fread(*bufferPtr, 1, (size_t)fileSize, fileHandle);
-        if (readSize != fileSize)
+        if (readSize != fileSize) {
             EXM_THROW(35, "Error reading dictionary file %s : %s",
                     fileName, strerror(errno));
+        }
     }
     fclose(fileHandle);
     return (size_t)fileSize;
@@ -840,7 +888,7 @@ static void FIO_adjustMemLimitForPatchFromMode(FIO_prefs_t* const prefs,
 /* FIO_removeMultiFilesWarning() :
  * Returns 1 if the console should abort, 0 if console should proceed.
  * This function handles logic when processing multiple files with -o, displaying the appropriate warnings/prompts.
- * 
+ *
  * If -f is specified, or there is just 1 file, zstd will always proceed as usual.
  * If --rm is specified, there will be a prompt asking for user confirmation.
  *         If -f is specified with --rm, zstd will proceed as usual
@@ -855,26 +903,25 @@ static int FIO_removeMultiFilesWarning(FIO_ctx_t* const fCtx, const FIO_prefs_t*
     if (fCtx->nbFilesTotal > 1 && !prefs->overwrite) {
         if (g_display_prefs.displayLevel <= displayLevelCutoff) {
             if (prefs->removeSrcFile) {
-                DISPLAYLEVEL(1, "zstd: Aborting... not deleting files and processing into dst: %s", outFileName);
+                DISPLAYLEVEL(1, "zstd: Aborting... not deleting files and processing into dst: %s\n", outFileName);
                 error =  1;
             }
         } else {
             if (!strcmp(outFileName, stdoutmark)) {
-                DISPLAYLEVEL(2, "zstd: WARNING: all input files will be processed and concatenated into stdout. ");
+                DISPLAYLEVEL(2, "zstd: WARNING: all input files will be processed and concatenated into stdout. \n");
             } else {
-                DISPLAYLEVEL(2, "zstd: WARNING: all input files will be processed and concatenated into a single output file: %s ", outFileName);
+                DISPLAYLEVEL(2, "zstd: WARNING: all input files will be processed and concatenated into a single output file: %s \n", outFileName);
             }
-            DISPLAYLEVEL(2, "\nThe concatenated output CANNOT regenerate the original directory tree. ")
+            DISPLAYLEVEL(2, "The concatenated output CANNOT regenerate the original directory tree. \n")
             if (prefs->removeSrcFile) {
                 if (fCtx->hasStdoutOutput) {
-                    DISPLAYLEVEL(1, "\nAborting. Use -f if you really want to delete the files and output to stdout");
+                    DISPLAYLEVEL(1, "Aborting. Use -f if you really want to delete the files and output to stdout\n");
                     error = 1;
                 } else {
                     error = g_display_prefs.displayLevel > displayLevelCutoff && UTIL_requireUserConfirmation("This is a destructive operation. Proceed? (y/n): ", "Aborting...", "yY", fCtx->hasStdinInput);
                 }
             }
         }
-        DISPLAY("\n");
     }
     return error;
 }
@@ -897,6 +944,15 @@ typedef struct {
     ZSTD_CStream* cctx;
 } cRess_t;
 
+/** ZSTD_cycleLog() :
+ *  condition for correct operation : hashLog > 1 */
+static U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat)
+{
+    U32 const btScale = ((U32)strat >= (U32)ZSTD_btlazy2);
+    assert(hashLog > 1);
+    return hashLog - btScale;
+}
+
 static void FIO_adjustParamsForPatchFromMode(FIO_prefs_t* const prefs,
                                     ZSTD_compressionParameters* comprParams,
                                     unsigned long long const dictSize,
@@ -908,7 +964,7 @@ static void FIO_adjustParamsForPatchFromMode(FIO_prefs_t* const prefs,
     FIO_adjustMemLimitForPatchFromMode(prefs, dictSize, maxSrcFileSize);
     if (fileWindowLog > ZSTD_WINDOWLOG_MAX)
         DISPLAYLEVEL(1, "Max window log exceeded by file (compression ratio will suffer)\n");
-    comprParams->windowLog = MIN(ZSTD_WINDOWLOG_MAX, fileWindowLog);
+    comprParams->windowLog = MAX(ZSTD_WINDOWLOG_MIN, MIN(ZSTD_WINDOWLOG_MAX, fileWindowLog));
     if (fileWindowLog > ZSTD_cycleLog(cParams.chainLog, cParams.strategy)) {
         if (!prefs->ldmFlag)
             DISPLAYLEVEL(1, "long mode automatically triggered\n");
@@ -919,7 +975,7 @@ static void FIO_adjustParamsForPatchFromMode(FIO_prefs_t* const prefs,
         DISPLAYLEVEL(1, "- Use --single-thread mode in the zstd cli\n");
         DISPLAYLEVEL(1, "- Set a larger targetLength (eg. --zstd=targetLength=4096)\n");
         DISPLAYLEVEL(1, "- Set a larger chainLog (eg. --zstd=chainLog=%u)\n", ZSTD_CHAINLOG_MAX);
-        DISPLAYLEVEL(1, "Also consdier playing around with searchLog and hashLog\n");
+        DISPLAYLEVEL(1, "Also consider playing around with searchLog and hashLog\n");
     }
 }
 
@@ -976,6 +1032,7 @@ static cRess_t FIO_createCResources(FIO_prefs_t* const prefs,
     if (prefs->ldmHashRateLog != FIO_LDM_PARAM_NOTSET) {
         CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_ldmHashRateLog, prefs->ldmHashRateLog) );
     }
+    CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_useRowMatchFinder, prefs->useRowMatchFinder));
     /* compression parameters */
     CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_windowLog, (int)comprParams.windowLog) );
     CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_chainLog, (int)comprParams.chainLog) );
@@ -983,7 +1040,7 @@ static cRess_t FIO_createCResources(FIO_prefs_t* const prefs,
     CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_searchLog, (int)comprParams.searchLog) );
     CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_minMatch, (int)comprParams.minMatch) );
     CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_targetLength, (int)comprParams.targetLength) );
-    CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_strategy, comprParams.strategy) );
+    CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_strategy, (int)comprParams.strategy) );
     CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_literalCompressionMode, (int)prefs->literalCompressionMode) );
     CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_enableDedicatedDictSearch, 1) );
     /* multi-threading */
@@ -1288,6 +1345,7 @@ FIO_compressZstdFrame(FIO_ctx_t* const fCtx,
     FILE* const dstFile = ress.dstFile;
     U64 compressedfilesize = 0;
     ZSTD_EndDirective directive = ZSTD_e_continue;
+    U64 pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN;
 
     /* stats */
     ZSTD_frameProgression previous_zfp_update = { 0, 0, 0, 0, 0, 0 };
@@ -1298,16 +1356,31 @@ FIO_compressZstdFrame(FIO_ctx_t* const fCtx,
     unsigned inputPresented = 0;
     unsigned inputBlocked = 0;
     unsigned lastJobID = 0;
+    UTIL_HumanReadableSize_t const file_hrs = UTIL_makeHumanReadableSize(fileSize);
 
     DISPLAYLEVEL(6, "compression using zstd format \n");
 
     /* init */
     if (fileSize != UTIL_FILESIZE_UNKNOWN) {
+        pledgedSrcSize = fileSize;
         CHECK(ZSTD_CCtx_setPledgedSrcSize(ress.cctx, fileSize));
     } else if (prefs->streamSrcSize > 0) {
       /* unknown source size; use the declared stream size */
+      pledgedSrcSize = prefs->streamSrcSize;
       CHECK( ZSTD_CCtx_setPledgedSrcSize(ress.cctx, prefs->streamSrcSize) );
     }
+
+    {
+        int windowLog;
+        UTIL_HumanReadableSize_t windowSize;
+        CHECK(ZSTD_CCtx_getParameter(ress.cctx, ZSTD_c_windowLog, &windowLog));
+        if (windowLog == 0) {
+            const ZSTD_compressionParameters cParams = ZSTD_getCParams(compressionLevel, fileSize, 0);
+            windowLog = cParams.windowLog;
+        }
+        windowSize = UTIL_makeHumanReadableSize(MAX(1ULL, MIN(1ULL << windowLog, pledgedSrcSize)));
+        DISPLAYLEVEL(4, "Decompression will require %.*f%s of memory\n", windowSize.precision, windowSize.value, windowSize.suffix);
+    }
     (void)srcFileName;
 
     /* Main compression loop */
@@ -1350,34 +1423,38 @@ FIO_compressZstdFrame(FIO_ctx_t* const fCtx,
             /* display notification; and adapt compression level */
             if (READY_FOR_UPDATE()) {
                 ZSTD_frameProgression const zfp = ZSTD_getFrameProgression(ress.cctx);
-                double const cShare = (double)zfp.produced / (zfp.consumed + !zfp.consumed/*avoid div0*/) * 100;
+                double const cShare = (double)zfp.produced / (double)(zfp.consumed + !zfp.consumed/*avoid div0*/) * 100;
+                UTIL_HumanReadableSize_t const buffered_hrs = UTIL_makeHumanReadableSize(zfp.ingested - zfp.consumed);
+                UTIL_HumanReadableSize_t const consumed_hrs = UTIL_makeHumanReadableSize(zfp.consumed);
+                UTIL_HumanReadableSize_t const produced_hrs = UTIL_makeHumanReadableSize(zfp.produced);
 
                 /* display progress notifications */
                 if (g_display_prefs.displayLevel >= 3) {
-                    DISPLAYUPDATE(3, "\r(L%i) Buffered :%4u MB - Consumed :%4u MB - Compressed :%4u MB => %.2f%% ",
+                    DISPLAYUPDATE(3, "\r(L%i) Buffered :%6.*f%4s - Consumed :%6.*f%4s - Compressed :%6.*f%4s => %.2f%% ",
                                 compressionLevel,
-                                (unsigned)((zfp.ingested - zfp.consumed) >> 20),
-                                (unsigned)(zfp.consumed >> 20),
-                                (unsigned)(zfp.produced >> 20),
+                                buffered_hrs.precision, buffered_hrs.value, buffered_hrs.suffix,
+                                consumed_hrs.precision, consumed_hrs.value, consumed_hrs.suffix,
+                                produced_hrs.precision, produced_hrs.value, produced_hrs.suffix,
                                 cShare );
-                } else {   /* summarized notifications if == 2 */
-                    DISPLAYLEVEL(2, "\r%79s\r", "");    /* Clear out the current displayed line */
+                } else if (g_display_prefs.displayLevel >= 2 || g_display_prefs.progressSetting == FIO_ps_always) {
+                    /* Require level 2 or forcibly displayed progress counter for summarized updates */
+                    DISPLAYLEVEL(1, "\r%79s\r", "");    /* Clear out the current displayed line */
                     if (fCtx->nbFilesTotal > 1) {
                         size_t srcFileNameSize = strlen(srcFileName);
                         /* Ensure that the string we print is roughly the same size each time */
                         if (srcFileNameSize > 18) {
                             const char* truncatedSrcFileName = srcFileName + srcFileNameSize - 15;
-                            DISPLAYLEVEL(2, "Compress: %u/%u files. Current: ...%s ",
-                                         fCtx->currFileIdx+1, fCtx->nbFilesTotal, truncatedSrcFileName);
+                            DISPLAYLEVEL(1, "Compress: %u/%u files. Current: ...%s ",
+                                        fCtx->currFileIdx+1, fCtx->nbFilesTotal, truncatedSrcFileName);
                         } else {
-                            DISPLAYLEVEL(2, "Compress: %u/%u files. Current: %*s ",
-                                         fCtx->currFileIdx+1, fCtx->nbFilesTotal, (int)(18-srcFileNameSize), srcFileName);
+                            DISPLAYLEVEL(1, "Compress: %u/%u files. Current: %*s ",
+                                        fCtx->currFileIdx+1, fCtx->nbFilesTotal, (int)(18-srcFileNameSize), srcFileName);
                         }
                     }
-                    DISPLAYLEVEL(2, "Read : %2u ", (unsigned)(zfp.consumed >> 20));
+                    DISPLAYLEVEL(1, "Read:%6.*f%4s ", consumed_hrs.precision, consumed_hrs.value, consumed_hrs.suffix);
                     if (fileSize != UTIL_FILESIZE_UNKNOWN)
-                        DISPLAYLEVEL(2, "/ %2u ", (unsigned)(fileSize >> 20));
-                    DISPLAYLEVEL(2, "MB ==> %2.f%%", cShare);
+                        DISPLAYLEVEL(2, "/%6.*f%4s", file_hrs.precision, file_hrs.value, file_hrs.suffix);
+                    DISPLAYLEVEL(1, " ==> %2.f%%", cShare);
                     DELAY_NEXT_UPDATE();
                 }
 
@@ -1499,7 +1576,7 @@ FIO_compressFilename_internal(FIO_ctx_t* const fCtx,
     U64 readsize = 0;
     U64 compressedfilesize = 0;
     U64 const fileSize = UTIL_getFileSize(srcFileName);
-    DISPLAYLEVEL(5, "%s: %u bytes \n", srcFileName, (unsigned)fileSize);
+    DISPLAYLEVEL(5, "%s: %llu bytes \n", srcFileName, (unsigned long long)fileSize);
 
     /* compression format selection */
     switch (prefs->compressionType) {
@@ -1545,18 +1622,22 @@ FIO_compressFilename_internal(FIO_ctx_t* const fCtx,
     fCtx->totalBytesOutput += (size_t)compressedfilesize;
     DISPLAYLEVEL(2, "\r%79s\r", "");
     if (g_display_prefs.displayLevel >= 2 &&
-        !fCtx->hasStdoutOutput && 
+        !fCtx->hasStdoutOutput &&
         (g_display_prefs.displayLevel >= 3 || fCtx->nbFilesTotal <= 1)) {
+        UTIL_HumanReadableSize_t hr_isize = UTIL_makeHumanReadableSize((U64) readsize);
+        UTIL_HumanReadableSize_t hr_osize = UTIL_makeHumanReadableSize((U64) compressedfilesize);
         if (readsize == 0) {
-            DISPLAYLEVEL(2,"%-20s :  (%6llu => %6llu bytes, %s) \n",
+            DISPLAYLEVEL(2,"%-20s :  (%6.*f%4s => %6.*f%4s, %s) \n",
                 srcFileName,
-                (unsigned long long)readsize, (unsigned long long) compressedfilesize,
+                hr_isize.precision, hr_isize.value, hr_isize.suffix,
+                hr_osize.precision, hr_osize.value, hr_osize.suffix,
                 dstFileName);
         } else {
-            DISPLAYLEVEL(2,"%-20s :%6.2f%%   (%6llu => %6llu bytes, %s) \n",
+            DISPLAYLEVEL(2,"%-20s :%6.2f%%   (%6.*f%4s => %6.*f%4s, %s) \n",
                 srcFileName,
-                (double)compressedfilesize / readsize * 100,
-                (unsigned long long)readsize, (unsigned long long) compressedfilesize,
+                (double)compressedfilesize / (double)readsize * 100,
+                hr_isize.precision, hr_isize.value, hr_isize.suffix,
+                hr_osize.precision, hr_osize.value, hr_osize.suffix,
                 dstFileName);
         }
     }
@@ -1593,23 +1674,27 @@ static int FIO_compressFilename_dstFile(FIO_ctx_t* const fCtx,
     int closeDstFile = 0;
     int result;
     stat_t statbuf;
-    int transfer_permissions = 0;
+    int transferMTime = 0;
     assert(ress.srcFile != NULL);
     if (ress.dstFile == NULL) {
+        int dstFilePermissions = DEFAULT_FILE_PERMISSIONS;
+        if ( strcmp (srcFileName, stdinmark)
+          && strcmp (dstFileName, stdoutmark)
+          && UTIL_stat(srcFileName, &statbuf)
+          && UTIL_isRegularFileStat(&statbuf) ) {
+            dstFilePermissions = statbuf.st_mode;
+            transferMTime = 1;
+        }
+
         closeDstFile = 1;
         DISPLAYLEVEL(6, "FIO_compressFilename_dstFile: opening dst: %s \n", dstFileName);
-        ress.dstFile = FIO_openDstFile(fCtx, prefs, srcFileName, dstFileName);
+        ress.dstFile = FIO_openDstFile(fCtx, prefs, srcFileName, dstFileName, dstFilePermissions);
         if (ress.dstFile==NULL) return 1;  /* could not open dstFileName */
         /* Must only be added after FIO_openDstFile() succeeds.
          * Otherwise we may delete the destination file if it already exists,
          * and the user presses Ctrl-C when asked if they wish to overwrite.
          */
         addHandler(dstFileName);
-
-        if ( strcmp (srcFileName, stdinmark)
-          && UTIL_stat(srcFileName, &statbuf)
-          && UTIL_isRegularFileStat(&statbuf) )
-            transfer_permissions = 1;
     }
 
     result = FIO_compressFilename_internal(fCtx, prefs, ress, dstFileName, srcFileName, compressionLevel);
@@ -1625,15 +1710,13 @@ static int FIO_compressFilename_dstFile(FIO_ctx_t* const fCtx,
             DISPLAYLEVEL(1, "zstd: %s: %s \n", dstFileName, strerror(errno));
             result=1;
         }
+        if (transferMTime) {
+            UTIL_utime(dstFileName, &statbuf);
+        }
         if ( (result != 0)  /* operation failure */
           && strcmp(dstFileName, stdoutmark)  /* special case : don't remove() stdout */
           ) {
             FIO_removeFile(dstFileName); /* remove compression artefact; note don't do anything special if remove() fails */
-        } else if (transfer_permissions) {
-            DISPLAYLEVEL(6, "FIO_compressFilename_dstFile: transferring permissions into dst: %s \n", dstFileName);
-            UTIL_setFileStat(dstFileName, &statbuf);
-        } else {
-            DISPLAYLEVEL(6, "FIO_compressFilename_dstFile: do not transfer permissions into dst: %s \n", dstFileName);
         }
     }
 
@@ -1692,7 +1775,7 @@ FIO_compressFilename_srcFile(FIO_ctx_t* const fCtx,
         return 0;
     }
 
-    ress.srcFile = FIO_openSrcFile(srcFileName);
+    ress.srcFile = FIO_openSrcFile(prefs, srcFileName);
     if (ress.srcFile == NULL) return 1;   /* srcFile could not be opened */
 
     result = FIO_compressFilename_dstFile(fCtx, prefs, ress, dstFileName, srcFileName, compressionLevel);
@@ -1713,6 +1796,50 @@ FIO_compressFilename_srcFile(FIO_ctx_t* const fCtx,
     return result;
 }
 
+static const char* checked_index(const char* options[], size_t length, size_t index) {
+    assert(index < length);
+    // Necessary to avoid warnings since -O3 will omit the above `assert`
+    (void) length;
+    return options[index];
+}
+
+#define INDEX(options, index) checked_index((options), sizeof(options)  / sizeof(char*), (index))
+
+void FIO_displayCompressionParameters(const FIO_prefs_t* prefs) {
+    static const char* formatOptions[5] = {ZSTD_EXTENSION, GZ_EXTENSION, XZ_EXTENSION,
+                                           LZMA_EXTENSION, LZ4_EXTENSION};
+    static const char* sparseOptions[3] = {" --no-sparse", "", " --sparse"};
+    static const char* checkSumOptions[3] = {" --no-check", "", " --check"};
+    static const char* rowMatchFinderOptions[3] = {"", " --no-row-match-finder", " --row-match-finder"};
+    static const char* compressLiteralsOptions[3] = {"", " --compress-literals", " --no-compress-literals"};
+
+    assert(g_display_prefs.displayLevel >= 4);
+
+    DISPLAY("--format=%s", formatOptions[prefs->compressionType]);
+    DISPLAY("%s", INDEX(sparseOptions, prefs->sparseFileSupport));
+    DISPLAY("%s", prefs->dictIDFlag ? "" : " --no-dictID");
+    DISPLAY("%s", INDEX(checkSumOptions, prefs->checksumFlag));
+    DISPLAY(" --block-size=%d", prefs->blockSize);
+    if (prefs->adaptiveMode)
+        DISPLAY(" --adapt=min=%d,max=%d", prefs->minAdaptLevel, prefs->maxAdaptLevel);
+    DISPLAY("%s", INDEX(rowMatchFinderOptions, prefs->useRowMatchFinder));
+    DISPLAY("%s", prefs->rsyncable ? " --rsyncable" : "");
+    if (prefs->streamSrcSize)
+        DISPLAY(" --stream-size=%u", (unsigned) prefs->streamSrcSize);
+    if (prefs->srcSizeHint)
+        DISPLAY(" --size-hint=%d", prefs->srcSizeHint);
+    if (prefs->targetCBlockSize)
+        DISPLAY(" --target-compressed-block-size=%u", (unsigned) prefs->targetCBlockSize);
+    DISPLAY("%s", INDEX(compressLiteralsOptions, prefs->literalCompressionMode));
+    DISPLAY(" --memory=%u", prefs->memLimit ? prefs->memLimit : 128 MB);
+    DISPLAY(" --threads=%d", prefs->nbWorkers);
+    DISPLAY("%s", prefs->excludeCompressedFiles ? " --exclude-compressed" : "");
+    DISPLAY(" --%scontent-size", prefs->contentSize ? "" : "no-");
+    DISPLAY("\n");
+}
+
+#undef INDEX
+
 int FIO_compressFilename(FIO_ctx_t* const fCtx, FIO_prefs_t* const prefs, const char* dstFileName,
                          const char* srcFileName, const char* dictFileName,
                          int compressionLevel, ZSTD_compressionParameters comprParams)
@@ -1795,7 +1922,7 @@ int FIO_compressMultipleFilenames(FIO_ctx_t* const fCtx,
     int status;
     int error = 0;
     cRess_t ress = FIO_createCResources(prefs, dictFileName,
-        FIO_getLargestFileSize(inFileNamesTable, fCtx->nbFilesTotal),
+        FIO_getLargestFileSize(inFileNamesTable, (unsigned)fCtx->nbFilesTotal),
         compressionLevel, comprParams);
 
     /* init */
@@ -1805,7 +1932,7 @@ int FIO_compressMultipleFilenames(FIO_ctx_t* const fCtx,
             FIO_freeCResources(&ress);
             return 1;
         }
-        ress.dstFile = FIO_openDstFile(fCtx, prefs, NULL, outFileName);
+        ress.dstFile = FIO_openDstFile(fCtx, prefs, NULL, outFileName, DEFAULT_FILE_PERMISSIONS);
         if (ress.dstFile == NULL) {  /* could not open outFileName */
             error = 1;
         } else {
@@ -1821,7 +1948,7 @@ int FIO_compressMultipleFilenames(FIO_ctx_t* const fCtx,
         }
     } else {
         if (outMirroredRootDirName)
-            UTIL_mirrorSourceFilesDirectories(inFileNamesTable, fCtx->nbFilesTotal, outMirroredRootDirName);
+            UTIL_mirrorSourceFilesDirectories(inFileNamesTable, (unsigned)fCtx->nbFilesTotal, outMirroredRootDirName);
 
         for (; fCtx->currFileIdx < fCtx->nbFilesTotal; ++fCtx->currFileIdx) {
             const char* const srcFileName = inFileNamesTable[fCtx->currFileIdx];
@@ -1845,14 +1972,19 @@ int FIO_compressMultipleFilenames(FIO_ctx_t* const fCtx,
         }
 
         if (outDirName)
-            FIO_checkFilenameCollisions(inFileNamesTable , fCtx->nbFilesTotal);
+            FIO_checkFilenameCollisions(inFileNamesTable , (unsigned)fCtx->nbFilesTotal);
     }
 
     if (fCtx->nbFilesProcessed >= 1 && fCtx->nbFilesTotal > 1 && fCtx->totalBytesInput != 0) {
+        UTIL_HumanReadableSize_t hr_isize = UTIL_makeHumanReadableSize((U64) fCtx->totalBytesInput);
+        UTIL_HumanReadableSize_t hr_osize = UTIL_makeHumanReadableSize((U64) fCtx->totalBytesOutput);
+
         DISPLAYLEVEL(2, "\r%79s\r", "");
-        DISPLAYLEVEL(2, "%d files compressed : %.2f%%  (%6zu => %6zu bytes)\n", fCtx->nbFilesProcessed,
+        DISPLAYLEVEL(2, "%3d files compressed :%.2f%%   (%6.*f%4s => %6.*f%4s)\n",
+                        fCtx->nbFilesProcessed,
                         (double)fCtx->totalBytesOutput/((double)fCtx->totalBytesInput)*100,
-                        fCtx->totalBytesInput, fCtx->totalBytesOutput);
+                        hr_isize.precision, hr_isize.value, hr_isize.suffix,
+                        hr_osize.precision, hr_osize.value, hr_osize.suffix);
     }
 
     FIO_freeCResources(&ress);
@@ -1892,7 +2024,7 @@ static dRess_t FIO_createDResources(FIO_prefs_t* const prefs, const char* dictFi
         EXM_THROW(60, "Error: %s : can't create ZSTD_DStream", strerror(errno));
     CHECK( ZSTD_DCtx_setMaxWindowSize(ress.dctx, prefs->memLimit) );
     CHECK( ZSTD_DCtx_setParameter(ress.dctx, ZSTD_d_forceIgnoreChecksum, !prefs->checksumFlag));
-    
+
     ress.srcBufferSize = ZSTD_DStreamInSize();
     ress.srcBuffer = malloc(ress.srcBufferSize);
     ress.dstBufferSize = ZSTD_DStreamOutSize();
@@ -2099,7 +2231,7 @@ FIO_decompressZstdFrame(FIO_ctx_t* const fCtx, dRess_t* ress, FILE* finput,
         if (srcFileLength>20) srcFileName += srcFileLength-20;
     }
 
-    ZSTD_resetDStream(ress->dctx);
+    ZSTD_DCtx_reset(ress->dctx, ZSTD_reset_session_only);
 
     /* Header loading : ensures ZSTD_getFrameHeader() will succeed */
     {   size_t const toDecode = ZSTD_FRAMEHEADERSIZE_MAX;
@@ -2114,6 +2246,8 @@ FIO_decompressZstdFrame(FIO_ctx_t* const fCtx, dRess_t* ress, FILE* finput,
         ZSTD_inBuffer  inBuff = { ress->srcBuffer, ress->srcBufferLoaded, 0 };
         ZSTD_outBuffer outBuff= { ress->dstBuffer, ress->dstBufferSize, 0 };
         size_t const readSizeHint = ZSTD_decompressStream(ress->dctx, &outBuff, &inBuff);
+        const int displayLevel = (g_display_prefs.progressSetting == FIO_ps_always) ? 1 : 2;
+        UTIL_HumanReadableSize_t const hrs = UTIL_makeHumanReadableSize(alreadyDecoded+frameSize);
         if (ZSTD_isError(readSizeHint)) {
             DISPLAYLEVEL(1, "%s : Decoding error (36) : %s \n",
                             srcFileName, ZSTD_getErrorName(readSizeHint));
@@ -2124,21 +2258,19 @@ FIO_decompressZstdFrame(FIO_ctx_t* const fCtx, dRess_t* ress, FILE* finput,
         /* Write block */
         storedSkips = FIO_fwriteSparse(ress->dstFile, ress->dstBuffer, outBuff.pos, prefs, storedSkips);
         frameSize += outBuff.pos;
-        if (!fCtx->hasStdoutOutput) {
-            if (fCtx->nbFilesTotal > 1) {
-                size_t srcFileNameSize = strlen(srcFileName);
-                if (srcFileNameSize > 18) {
-                    const char* truncatedSrcFileName = srcFileName + srcFileNameSize - 15;
-                    DISPLAYUPDATE(2, "\rDecompress: %2u/%2u files. Current: ...%s : %u MB...    ",
-                                    fCtx->currFileIdx+1, fCtx->nbFilesTotal, truncatedSrcFileName, (unsigned)((alreadyDecoded+frameSize)>>20) );
-                } else {
-                    DISPLAYUPDATE(2, "\rDecompress: %2u/%2u files. Current: %s : %u MB...    ",
-                                fCtx->currFileIdx+1, fCtx->nbFilesTotal, srcFileName, (unsigned)((alreadyDecoded+frameSize)>>20) );
-                }
+        if (fCtx->nbFilesTotal > 1) {
+            size_t srcFileNameSize = strlen(srcFileName);
+            if (srcFileNameSize > 18) {
+                const char* truncatedSrcFileName = srcFileName + srcFileNameSize - 15;
+                DISPLAYUPDATE(displayLevel, "\rDecompress: %2u/%2u files. Current: ...%s : %.*f%s...    ",
+                                fCtx->currFileIdx+1, fCtx->nbFilesTotal, truncatedSrcFileName, hrs.precision, hrs.value, hrs.suffix);
             } else {
-                DISPLAYUPDATE(2, "\r%-20.20s : %u MB...     ",
-                                srcFileName, (unsigned)((alreadyDecoded+frameSize)>>20) );
+                DISPLAYUPDATE(displayLevel, "\rDecompress: %2u/%2u files. Current: %s : %.*f%s...    ",
+                            fCtx->currFileIdx+1, fCtx->nbFilesTotal, srcFileName, hrs.precision, hrs.value, hrs.suffix);
             }
+        } else {
+            DISPLAYUPDATE(displayLevel, "\r%-20.20s : %.*f%s...     ",
+                            srcFileName, hrs.precision, hrs.value, hrs.suffix);
         }
 
         if (inBuff.pos > 0) {
@@ -2362,9 +2494,11 @@ FIO_decompressLz4Frame(dRess_t* ress, FILE* srcFile,
 
             /* Write Block */
             if (decodedBytes) {
+                UTIL_HumanReadableSize_t hrs;
                 storedSkips = FIO_fwriteSparse(ress->dstFile, ress->dstBuffer, decodedBytes, prefs, storedSkips);
                 filesize += decodedBytes;
-                DISPLAYUPDATE(2, "\rDecompressed : %u MB  ", (unsigned)(filesize>>20));
+                hrs = UTIL_makeHumanReadableSize(filesize);
+                DISPLAYUPDATE(2, "\rDecompressed : %.*f%s  ", hrs.precision, hrs.value, hrs.suffix);
             }
 
             if (!nextToLoad) break;
@@ -2472,10 +2606,10 @@ static int FIO_decompressFrames(FIO_ctx_t* const fCtx,
     fCtx->totalBytesOutput += (size_t)filesize;
     DISPLAYLEVEL(2, "\r%79s\r", "");
     /* No status message in pipe mode (stdin - stdout) or multi-files mode */
-    if (g_display_prefs.displayLevel >= 2) {
-        if (fCtx->nbFilesTotal <= 1 || g_display_prefs.displayLevel >= 3) {
-            DISPLAYLEVEL(2, "%-20s: %llu bytes \n", srcFileName, filesize);
-        }
+    if ((g_display_prefs.displayLevel >= 2 && fCtx->nbFilesTotal <= 1) ||
+        g_display_prefs.displayLevel >= 3 ||
+        g_display_prefs.progressSetting == FIO_ps_always) {
+        DISPLAYLEVEL(1, "\r%-20s: %llu bytes \n", srcFileName, filesize);
     }
 
     return 0;
@@ -2495,13 +2629,22 @@ static int FIO_decompressDstFile(FIO_ctx_t* const fCtx,
 {
     int result;
     stat_t statbuf;
-    int transfer_permissions = 0;
     int releaseDstFile = 0;
+    int transferMTime = 0;
 
     if ((ress.dstFile == NULL) && (prefs->testMode==0)) {
+        int dstFilePermissions = DEFAULT_FILE_PERMISSIONS;
+        if ( strcmp(srcFileName, stdinmark)   /* special case : don't transfer permissions from stdin */
+          && strcmp(dstFileName, stdoutmark)
+          && UTIL_stat(srcFileName, &statbuf)
+          && UTIL_isRegularFileStat(&statbuf) ) {
+            dstFilePermissions = statbuf.st_mode;
+            transferMTime = 1;
+        }
+
         releaseDstFile = 1;
 
-        ress.dstFile = FIO_openDstFile(fCtx, prefs, srcFileName, dstFileName);
+        ress.dstFile = FIO_openDstFile(fCtx, prefs, srcFileName, dstFileName, dstFilePermissions);
         if (ress.dstFile==NULL) return 1;
 
         /* Must only be added after FIO_openDstFile() succeeds.
@@ -2509,11 +2652,6 @@ static int FIO_decompressDstFile(FIO_ctx_t* const fCtx,
          * and the user presses Ctrl-C when asked if they wish to overwrite.
          */
         addHandler(dstFileName);
-
-        if ( strcmp(srcFileName, stdinmark)   /* special case : don't transfer permissions from stdin */
-          && UTIL_stat(srcFileName, &statbuf)
-          && UTIL_isRegularFileStat(&statbuf) )
-            transfer_permissions = 1;
     }
 
     result = FIO_decompressFrames(fCtx, ress, srcFile, prefs, dstFileName, srcFileName);
@@ -2527,12 +2665,14 @@ static int FIO_decompressDstFile(FIO_ctx_t* const fCtx,
             result = 1;
         }
 
+        if (transferMTime) {
+            UTIL_utime(dstFileName, &statbuf);
+        }
+
         if ( (result != 0)  /* operation failure */
           && strcmp(dstFileName, stdoutmark)  /* special case : don't remove() stdout */
           ) {
             FIO_removeFile(dstFileName);  /* remove decompression artefact; note: don't do anything special if remove() fails */
-        } else if ( transfer_permissions /* file permissions correctly extracted from src */ ) {
-            UTIL_setFileStat(dstFileName, &statbuf);  /* transfer file permissions from src into dst */
         }
     }
 
@@ -2555,7 +2695,7 @@ static int FIO_decompressSrcFile(FIO_ctx_t* const fCtx, FIO_prefs_t* const prefs
         return 1;
     }
 
-    srcFile = FIO_openSrcFile(srcFileName);
+    srcFile = FIO_openSrcFile(prefs, srcFileName);
     if (srcFile==NULL) return 1;
     ress.srcBufferLoaded = 0;
 
@@ -2734,7 +2874,7 @@ FIO_decompressMultipleFilenames(FIO_ctx_t* const fCtx,
             return 1;
         }
         if (!prefs->testMode) {
-            ress.dstFile = FIO_openDstFile(fCtx, prefs, NULL, outFileName);
+            ress.dstFile = FIO_openDstFile(fCtx, prefs, NULL, outFileName, DEFAULT_FILE_PERMISSIONS);
             if (ress.dstFile == 0) EXM_THROW(19, "cannot open %s", outFileName);
         }
         for (; fCtx->currFileIdx < fCtx->nbFilesTotal; fCtx->currFileIdx++) {
@@ -2747,7 +2887,7 @@ FIO_decompressMultipleFilenames(FIO_ctx_t* const fCtx,
                         strerror(errno));
     } else {
         if (outMirroredRootDirName)
-            UTIL_mirrorSourceFilesDirectories(srcNamesTable, fCtx->nbFilesTotal, outMirroredRootDirName);
+            UTIL_mirrorSourceFilesDirectories(srcNamesTable, (unsigned)fCtx->nbFilesTotal, outMirroredRootDirName);
 
         for (; fCtx->currFileIdx < fCtx->nbFilesTotal; fCtx->currFileIdx++) {   /* create dstFileName */
             const char* const srcFileName = srcNamesTable[fCtx->currFileIdx];
@@ -2769,9 +2909,9 @@ FIO_decompressMultipleFilenames(FIO_ctx_t* const fCtx,
             error |= status;
         }
         if (outDirName)
-            FIO_checkFilenameCollisions(srcNamesTable , fCtx->nbFilesTotal);
+            FIO_checkFilenameCollisions(srcNamesTable , (unsigned)fCtx->nbFilesTotal);
     }
-    
+
     if (fCtx->nbFilesProcessed >= 1  && fCtx->nbFilesTotal > 1 && fCtx->totalBytesOutput != 0)
         DISPLAYLEVEL(2, "%d files decompressed : %6zu bytes total \n", fCtx->nbFilesProcessed, fCtx->totalBytesOutput);
 
@@ -2905,7 +3045,7 @@ static InfoError
 getFileInfo_fileConfirmed(fileInfo_t* info, const char* inFileName)
 {
     InfoError status;
-    FILE* const srcFile = FIO_openSrcFile(inFileName);
+    FILE* const srcFile = FIO_openSrcFile(NULL, inFileName);
     ERROR_IF(srcFile == NULL, info_file_error, "Error: could not open source file %s", inFileName);
 
     info->compressedSize = UTIL_getFileSize(inFileName);
@@ -2933,25 +3073,24 @@ getFileInfo(fileInfo_t* info, const char* srcFileName)
 static void
 displayInfo(const char* inFileName, const fileInfo_t* info, int displayLevel)
 {
-    unsigned const unit = info->compressedSize < (1 MB) ? (1 KB) : (1 MB);
-    const char* const unitStr = info->compressedSize < (1 MB) ? "KB" : "MB";
-    double const windowSizeUnit = (double)info->windowSize / unit;
-    double const compressedSizeUnit = (double)info->compressedSize / unit;
-    double const decompressedSizeUnit = (double)info->decompressedSize / unit;
-    double const ratio = (info->compressedSize == 0) ? 0 : ((double)info->decompressedSize)/info->compressedSize;
+    UTIL_HumanReadableSize_t const window_hrs = UTIL_makeHumanReadableSize(info->windowSize);
+    UTIL_HumanReadableSize_t const compressed_hrs = UTIL_makeHumanReadableSize(info->compressedSize);
+    UTIL_HumanReadableSize_t const decompressed_hrs = UTIL_makeHumanReadableSize(info->decompressedSize);
+    double const ratio = (info->compressedSize == 0) ? 0 : ((double)info->decompressedSize)/(double)info->compressedSize;
     const char* const checkString = (info->usesCheck ? "XXH64" : "None");
     if (displayLevel <= 2) {
         if (!info->decompUnavailable) {
-            DISPLAYOUT("%6d  %5d  %7.2f %2s  %9.2f %2s  %5.3f  %5s  %s\n",
+            DISPLAYOUT("%6d  %5d  %6.*f%4s  %8.*f%4s  %5.3f  %5s  %s\n",
                     info->numSkippableFrames + info->numActualFrames,
                     info->numSkippableFrames,
-                    compressedSizeUnit, unitStr, decompressedSizeUnit, unitStr,
+                    compressed_hrs.precision, compressed_hrs.value, compressed_hrs.suffix,
+                    decompressed_hrs.precision, decompressed_hrs.value, decompressed_hrs.suffix,
                     ratio, checkString, inFileName);
         } else {
-            DISPLAYOUT("%6d  %5d  %7.2f %2s                       %5s  %s\n",
+            DISPLAYOUT("%6d  %5d  %6.*f%4s                       %5s  %s\n",
                     info->numSkippableFrames + info->numActualFrames,
                     info->numSkippableFrames,
-                    compressedSizeUnit, unitStr,
+                    compressed_hrs.precision, compressed_hrs.value, compressed_hrs.suffix,
                     checkString, inFileName);
         }
     } else {
@@ -2959,15 +3098,15 @@ displayInfo(const char* inFileName, const fileInfo_t* info, int displayLevel)
         DISPLAYOUT("# Zstandard Frames: %d\n", info->numActualFrames);
         if (info->numSkippableFrames)
             DISPLAYOUT("# Skippable Frames: %d\n", info->numSkippableFrames);
-        DISPLAYOUT("Window Size: %.2f %2s (%llu B)\n",
-                   windowSizeUnit, unitStr,
+        DISPLAYOUT("Window Size: %.*f%s (%llu B)\n",
+                   window_hrs.precision, window_hrs.value, window_hrs.suffix,
                    (unsigned long long)info->windowSize);
-        DISPLAYOUT("Compressed Size: %.2f %2s (%llu B)\n",
-                    compressedSizeUnit, unitStr,
+        DISPLAYOUT("Compressed Size: %.*f%s (%llu B)\n",
+                    compressed_hrs.precision, compressed_hrs.value, compressed_hrs.suffix,
                     (unsigned long long)info->compressedSize);
         if (!info->decompUnavailable) {
-            DISPLAYOUT("Decompressed Size: %.2f %2s (%llu B)\n",
-                    decompressedSizeUnit, unitStr,
+            DISPLAYOUT("Decompressed Size: %.*f%s (%llu B)\n",
+                    decompressed_hrs.precision, decompressed_hrs.value, decompressed_hrs.suffix,
                     (unsigned long long)info->decompressedSize);
             DISPLAYOUT("Ratio: %.4f\n", ratio);
         }
@@ -3055,24 +3194,23 @@ int FIO_listMultipleFiles(unsigned numFiles, const char** filenameTable, int dis
                 error |= FIO_listFile(&total, filenameTable[u], displayLevel);
         }   }
         if (numFiles > 1 && displayLevel <= 2) {   /* display total */
-            unsigned const unit = total.compressedSize < (1 MB) ? (1 KB) : (1 MB);
-            const char* const unitStr = total.compressedSize < (1 MB) ? "KB" : "MB";
-            double const compressedSizeUnit = (double)total.compressedSize / unit;
-            double const decompressedSizeUnit = (double)total.decompressedSize / unit;
-            double const ratio = (total.compressedSize == 0) ? 0 : ((double)total.decompressedSize)/total.compressedSize;
+            UTIL_HumanReadableSize_t const compressed_hrs = UTIL_makeHumanReadableSize(total.compressedSize);
+            UTIL_HumanReadableSize_t const decompressed_hrs = UTIL_makeHumanReadableSize(total.decompressedSize);
+            double const ratio = (total.compressedSize == 0) ? 0 : ((double)total.decompressedSize)/(double)total.compressedSize;
             const char* const checkString = (total.usesCheck ? "XXH64" : "");
             DISPLAYOUT("----------------------------------------------------------------- \n");
             if (total.decompUnavailable) {
-                DISPLAYOUT("%6d  %5d  %7.2f %2s                       %5s  %u files\n",
+                DISPLAYOUT("%6d  %5d  %6.*f%4s                       %5s  %u files\n",
                         total.numSkippableFrames + total.numActualFrames,
                         total.numSkippableFrames,
-                        compressedSizeUnit, unitStr,
+                        compressed_hrs.precision, compressed_hrs.value, compressed_hrs.suffix,
                         checkString, (unsigned)total.nbFiles);
             } else {
-                DISPLAYOUT("%6d  %5d  %7.2f %2s  %9.2f %2s  %5.3f  %5s  %u files\n",
+                DISPLAYOUT("%6d  %5d  %6.*f%4s  %8.*f%4s  %5.3f  %5s  %u files\n",
                         total.numSkippableFrames + total.numActualFrames,
                         total.numSkippableFrames,
-                        compressedSizeUnit, unitStr, decompressedSizeUnit, unitStr,
+                        compressed_hrs.precision, compressed_hrs.value, compressed_hrs.suffix,
+                        decompressed_hrs.precision, decompressed_hrs.value, decompressed_hrs.suffix,
                         ratio, checkString, (unsigned)total.nbFiles);
         }   }
         return error;
diff --git a/programs/fileio.h b/programs/fileio.h
index 05e6d06815f7..61094db83cba 100644
--- a/programs/fileio.h
+++ b/programs/fileio.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -68,6 +68,8 @@ void FIO_freeContext(FIO_ctx_t* const fCtx);
 
 typedef struct FIO_display_prefs_s FIO_display_prefs_t;
 
+typedef enum { FIO_ps_auto, FIO_ps_never, FIO_ps_always } FIO_progressSetting_e;
+
 /*-*************************************
 *  Parameters
 ***************************************/
@@ -77,6 +79,7 @@ void FIO_overwriteMode(FIO_prefs_t* const prefs);
 void FIO_setAdaptiveMode(FIO_prefs_t* const prefs, unsigned adapt);
 void FIO_setAdaptMin(FIO_prefs_t* const prefs, int minCLevel);
 void FIO_setAdaptMax(FIO_prefs_t* const prefs, int maxCLevel);
+void FIO_setUseRowMatchFinder(FIO_prefs_t* const prefs, int useRowMatchFinder);
 void FIO_setBlockSize(FIO_prefs_t* const prefs, int blockSize);
 void FIO_setChecksumFlag(FIO_prefs_t* const prefs, int checksumFlag);
 void FIO_setDictIDFlag(FIO_prefs_t* const prefs, int dictIDFlag);
@@ -97,13 +100,15 @@ void FIO_setSrcSizeHint(FIO_prefs_t* const prefs, size_t srcSizeHint);
 void FIO_setTestMode(FIO_prefs_t* const prefs, int testMode);
 void FIO_setLiteralCompressionMode(
         FIO_prefs_t* const prefs,
-        ZSTD_literalCompressionMode_e mode);
+        ZSTD_paramSwitch_e mode);
 
-void FIO_setNoProgress(unsigned noProgress);
+void FIO_setProgressSetting(FIO_progressSetting_e progressSetting);
 void FIO_setNotificationLevel(int level);
 void FIO_setExcludeCompressedFile(FIO_prefs_t* const prefs, int excludeCompressedFiles);
+void FIO_setAllowBlockDevices(FIO_prefs_t* const prefs, int allowBlockDevices);
 void FIO_setPatchFromMode(FIO_prefs_t* const prefs, int value);
 void FIO_setContentSize(FIO_prefs_t* const prefs, int value);
+void FIO_displayCompressionParameters(const FIO_prefs_t* prefs);
 
 /* FIO_ctx_t functions */
 void FIO_setNbFilesTotal(FIO_ctx_t* const fCtx, int value);
diff --git a/programs/platform.h b/programs/platform.h
index 68be70bb333b..b858e3b484c2 100644
--- a/programs/platform.h
+++ b/programs/platform.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020, Przemyslaw Skibinski, Yann Collet, Facebook, Inc.
+ * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -22,6 +22,7 @@ extern "C" {
 ****************************************/
 #if defined(_MSC_VER)
 #  define _CRT_SECURE_NO_WARNINGS    /* Disable Visual Studio warning messages for fopen, strncpy, strerror */
+#  define _CRT_NONSTDC_NO_WARNINGS   /* Disable C4996 complaining about posix function names */
 #  if (_MSC_VER <= 1800)             /* 1800 == Visual Studio 2013 */
 #    define _CRT_SECURE_NO_DEPRECATE /* VS2005 - must be declared before <io.h> and <windows.h> */
 #    define snprintf sprintf_s       /* snprintf unsupported by Visual <= 2013 */
diff --git a/programs/timefn.c b/programs/timefn.c
index 95460d0d971d..64577b0e932a 100644
--- a/programs/timefn.c
+++ b/programs/timefn.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/programs/timefn.h b/programs/timefn.h
index 5d2818e8a1b7..3fcd78a28ec1 100644
--- a/programs/timefn.h
+++ b/programs/timefn.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/programs/util.c b/programs/util.c
index 5386d005c26c..d69b72a37ca6 100644
--- a/programs/util.c
+++ b/programs/util.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020, Przemyslaw Skibinski, Yann Collet, Facebook, Inc.
+ * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -159,6 +159,29 @@ int UTIL_chmod(char const* filename, const stat_t* statbuf, mode_t permissions)
     return chmod(filename, permissions);
 }
 
+/* set access and modification times */
+int UTIL_utime(const char* filename, const stat_t *statbuf)
+{
+    int ret;
+    /* We check that st_mtime is a macro here in order to give us confidence
+     * that struct stat has a struct timespec st_mtim member. We need this
+     * check because there are some platforms that claim to be POSIX 2008
+     * compliant but which do not have st_mtim... */
+#if (PLATFORM_POSIX_VERSION >= 200809L) && defined(st_mtime)
+    /* (atime, mtime) */
+    struct timespec timebuf[2] = { {0, UTIME_NOW} };
+    timebuf[1] = statbuf->st_mtim;
+    ret = utimensat(AT_FDCWD, filename, timebuf, 0);
+#else
+    struct utimbuf timebuf;
+    timebuf.actime = time(NULL);
+    timebuf.modtime = statbuf->st_mtime;
+    ret = utime(filename, &timebuf);
+#endif
+    errno = 0;
+    return ret;
+}
+
 int UTIL_setFileStat(const char *filename, const stat_t *statbuf)
 {
     int res = 0;
@@ -168,25 +191,7 @@ int UTIL_setFileStat(const char *filename, const stat_t *statbuf)
         return -1;
 
     /* set access and modification times */
-    /* We check that st_mtime is a macro here in order to give us confidence
-     * that struct stat has a struct timespec st_mtim member. We need this
-     * check because there are some platforms that claim to be POSIX 2008
-     * compliant but which do not have st_mtim... */
-#if (PLATFORM_POSIX_VERSION >= 200809L) && defined(st_mtime)
-    {
-        /* (atime, mtime) */
-        struct timespec timebuf[2] = { {0, UTIME_NOW} };
-        timebuf[1] = statbuf->st_mtim;
-        res += utimensat(AT_FDCWD, filename, timebuf, 0);
-    }
-#else
-    {
-        struct utimbuf timebuf;
-        timebuf.actime = time(NULL);
-        timebuf.modtime = statbuf->st_mtime;
-        res += utime(filename, &timebuf);
-    }
-#endif
+    res += UTIL_utime(filename, statbuf);
 
 #if !defined(_WIN32)
     res += chown(filename, statbuf->st_uid, statbuf->st_gid);  /* Copy ownership */
@@ -260,6 +265,17 @@ int UTIL_isFIFOStat(const stat_t* statbuf)
     return 0;
 }
 
+/* UTIL_isBlockDevStat : distinguish named pipes */
+int UTIL_isBlockDevStat(const stat_t* statbuf)
+{
+/* macro guards, as defined in : https://linux.die.net/man/2/lstat */
+#if PLATFORM_POSIX_VERSION >= 200112L
+    if (S_ISBLK(statbuf->st_mode)) return 1;
+#endif
+    (void)statbuf;
+    return 0;
+}
+
 int UTIL_isLink(const char* infilename)
 {
 /* macro guards, as defined in : https://linux.die.net/man/2/lstat */
@@ -292,6 +308,62 @@ U64 UTIL_getFileSizeStat(const stat_t* statbuf)
     return (U64)statbuf->st_size;
 }
 
+UTIL_HumanReadableSize_t UTIL_makeHumanReadableSize(U64 size)
+{
+    UTIL_HumanReadableSize_t hrs;
+
+    if (g_utilDisplayLevel > 3) {
+        /* In verbose mode, do not scale sizes down, except in the case of
+         * values that exceed the integral precision of a double. */
+        if (size >= (1ull << 53)) {
+            hrs.value = (double)size / (1ull << 20);
+            hrs.suffix = " MiB";
+            /* At worst, a double representation of a maximal size will be
+             * accurate to better than tens of kilobytes. */
+            hrs.precision = 2;
+        } else {
+            hrs.value = (double)size;
+            hrs.suffix = " B";
+            hrs.precision = 0;
+        }
+    } else {
+        /* In regular mode, scale sizes down and use suffixes. */
+        if (size >= (1ull << 60)) {
+            hrs.value = (double)size / (1ull << 60);
+            hrs.suffix = " EiB";
+        } else if (size >= (1ull << 50)) {
+            hrs.value = (double)size / (1ull << 50);
+            hrs.suffix = " PiB";
+        } else if (size >= (1ull << 40)) {
+            hrs.value = (double)size / (1ull << 40);
+            hrs.suffix = " TiB";
+        } else if (size >= (1ull << 30)) {
+            hrs.value = (double)size / (1ull << 30);
+            hrs.suffix = " GiB";
+        } else if (size >= (1ull << 20)) {
+            hrs.value = (double)size / (1ull << 20);
+            hrs.suffix = " MiB";
+        } else if (size >= (1ull << 10)) {
+            hrs.value = (double)size / (1ull << 10);
+            hrs.suffix = " KiB";
+        } else {
+            hrs.value = (double)size;
+            hrs.suffix = " B";
+        }
+
+        if (hrs.value >= 100 || (U64)hrs.value == size) {
+            hrs.precision = 0;
+        } else if (hrs.value >= 10) {
+            hrs.precision = 1;
+        } else if (hrs.value > 1) {
+            hrs.precision = 2;
+        } else {
+            hrs.precision = 3;
+        }
+    }
+
+    return hrs;
+}
 
 U64 UTIL_getTotalFileSize(const char* const * fileNamesTable, unsigned nbFiles)
 {
@@ -312,9 +384,7 @@ U64 UTIL_getTotalFileSize(const char* const * fileNamesTable, unsigned nbFiles)
 static size_t readLineFromFile(char* buf, size_t len, FILE* file)
 {
     assert(!feof(file));
-    /* Work around Cygwin problem when len == 1 it returns NULL. */
-    if (len <= 1) return 0;
-    CONTROL( fgets(buf, (int) len, file) );
+    if ( fgets(buf, (int) len, file) == NULL ) return 0;
     {   size_t linelen = strlen(buf);
         if (strlen(buf)==0) return 0;
         if (buf[linelen-1] == '\n') linelen--;
@@ -670,7 +740,27 @@ const char* UTIL_getFileExtension(const char* infilename)
 
 static int pathnameHas2Dots(const char *pathname)
 {
-    return NULL != strstr(pathname, "..");
+    /* We need to figure out whether any ".." present in the path is a whole
+     * path token, which is the case if it is bordered on both sides by either
+     * the beginning/end of the path or by a directory separator.
+     */
+    const char *needle = pathname;
+    while (1) {
+        needle = strstr(needle, "..");
+
+        if (needle == NULL) {
+            return 0;
+        }
+
+        if ((needle == pathname || needle[-1] == PATH_SEP)
+         && (needle[2] == '\0' || needle[2] == PATH_SEP)) {
+            return 1;
+        }
+
+        /* increment so we search for the next match */
+        needle++;
+    };
+    return 0;
 }
 
 static int isFileNameValidForMirroredOutput(const char *filename)
@@ -902,7 +992,7 @@ makeUniqueMirroredDestDirs(char** srcDirNames, unsigned nbFile, const char* outD
         char* prevDirName = srcDirNames[i - 1];
         char* currDirName = srcDirNames[i];
 
-        /* note: we alwasy compare trimmed path, i.e.:
+        /* note: we always compare trimmed path, i.e.:
          * src dir of "./foo" and "/foo" will be both saved into:
          * "outDirName/foo/" */
         if (!firstIsParentOrSameDirOfSecond(trimPath(prevDirName),
@@ -910,7 +1000,7 @@ makeUniqueMirroredDestDirs(char** srcDirNames, unsigned nbFile, const char* outD
             uniqueDirNr++;
 
         /* we need maintain original src dir name instead of trimmed
-         * dir, so we can retrive the original src dir's mode_t */
+         * dir, so we can retrieve the original src dir's mode_t */
         uniqueDirNames[uniqueDirNr - 1] = currDirName;
     }
 
@@ -954,7 +1044,7 @@ void UTIL_mirrorSourceFilesDirectories(const char** inFileNames, unsigned int nb
 }
 
 FileNamesTable*
-UTIL_createExpandedFNT(const char** inputNames, size_t nbIfns, int followLinks)
+UTIL_createExpandedFNT(const char* const* inputNames, size_t nbIfns, int followLinks)
 {
     unsigned nbFiles;
     char* buf = (char*)malloc(LIST_SIZE_INCREASE);
@@ -1019,7 +1109,7 @@ FileNamesTable* UTIL_createFNT_fromROTable(const char** filenames, size_t nbFile
 
 
 /*-****************************************
-*  count the number of physical cores
+*  count the number of cores
 ******************************************/
 
 #if defined(_WIN32) || defined(WIN32)
@@ -1028,10 +1118,26 @@ FileNamesTable* UTIL_createFNT_fromROTable(const char** filenames, size_t nbFile
 
 typedef BOOL(WINAPI* LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
 
-int UTIL_countPhysicalCores(void)
+DWORD CountSetBits(ULONG_PTR bitMask)
+{
+    DWORD LSHIFT = sizeof(ULONG_PTR)*8 - 1;
+    DWORD bitSetCount = 0;
+    ULONG_PTR bitTest = (ULONG_PTR)1 << LSHIFT;
+    DWORD i;
+
+    for (i = 0; i <= LSHIFT; ++i)
+    {
+        bitSetCount += ((bitMask & bitTest)?1:0);
+        bitTest/=2;
+    }
+
+    return bitSetCount;
+}
+
+int UTIL_countCores(int logical)
 {
-    static int numPhysicalCores = 0;
-    if (numPhysicalCores != 0) return numPhysicalCores;
+    static int numCores = 0;
+    if (numCores != 0) return numCores;
 
     {   LPFN_GLPI glpi;
         BOOL done = FALSE;
@@ -1077,7 +1183,10 @@ int UTIL_countPhysicalCores(void)
         while (byteOffset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= returnLength) {
 
             if (ptr->Relationship == RelationProcessorCore) {
-                numPhysicalCores++;
+                if (logical)
+                    numCores += CountSetBits(ptr->ProcessorMask);
+                else
+                    numCores++;
             }
 
             ptr++;
@@ -1086,17 +1195,17 @@ int UTIL_countPhysicalCores(void)
 
         free(buffer);
 
-        return numPhysicalCores;
+        return numCores;
     }
 
 failed:
     /* try to fall back on GetSystemInfo */
     {   SYSTEM_INFO sysinfo;
         GetSystemInfo(&sysinfo);
-        numPhysicalCores = sysinfo.dwNumberOfProcessors;
-        if (numPhysicalCores == 0) numPhysicalCores = 1; /* just in case */
+        numCores = sysinfo.dwNumberOfProcessors;
+        if (numCores == 0) numCores = 1; /* just in case */
     }
-    return numPhysicalCores;
+    return numCores;
 }
 
 #elif defined(__APPLE__)
@@ -1105,24 +1214,24 @@ failed:
 
 /* Use apple-provided syscall
  * see: man 3 sysctl */
-int UTIL_countPhysicalCores(void)
+int UTIL_countCores(int logical)
 {
-    static S32 numPhysicalCores = 0; /* apple specifies int32_t */
-    if (numPhysicalCores != 0) return numPhysicalCores;
+    static S32 numCores = 0; /* apple specifies int32_t */
+    if (numCores != 0) return numCores;
 
     {   size_t size = sizeof(S32);
-        int const ret = sysctlbyname("hw.physicalcpu", &numPhysicalCores, &size, NULL, 0);
+        int const ret = sysctlbyname(logical ? "hw.logicalcpu" : "hw.physicalcpu", &numCores, &size, NULL, 0);
         if (ret != 0) {
             if (errno == ENOENT) {
                 /* entry not present, fall back on 1 */
-                numPhysicalCores = 1;
+                numCores = 1;
             } else {
-                perror("zstd: can't get number of physical cpus");
+                perror("zstd: can't get number of cpus");
                 exit(1);
             }
         }
 
-        return numPhysicalCores;
+        return numCores;
     }
 }
 
@@ -1131,16 +1240,16 @@ int UTIL_countPhysicalCores(void)
 /* parse /proc/cpuinfo
  * siblings / cpu cores should give hyperthreading ratio
  * otherwise fall back on sysconf */
-int UTIL_countPhysicalCores(void)
+int UTIL_countCores(int logical)
 {
-    static int numPhysicalCores = 0;
+    static int numCores = 0;
 
-    if (numPhysicalCores != 0) return numPhysicalCores;
+    if (numCores != 0) return numCores;
 
-    numPhysicalCores = (int)sysconf(_SC_NPROCESSORS_ONLN);
-    if (numPhysicalCores == -1) {
+    numCores = (int)sysconf(_SC_NPROCESSORS_ONLN);
+    if (numCores == -1) {
         /* value not queryable, fall back on 1 */
-        return numPhysicalCores = 1;
+        return numCores = 1;
     }
 
     /* try to determine if there's hyperthreading */
@@ -1154,7 +1263,7 @@ int UTIL_countPhysicalCores(void)
 
         if (cpuinfo == NULL) {
             /* fall back on the sysconf value */
-            return numPhysicalCores;
+            return numCores;
         }
 
         /* assume the cpu cores/siblings values will be constant across all
@@ -1183,12 +1292,17 @@ int UTIL_countPhysicalCores(void)
                 /* fall back on the sysconf value */
                 goto failed;
         }   }
-        if (siblings && cpu_cores) {
+        if (siblings && cpu_cores && siblings > cpu_cores) {
             ratio = siblings / cpu_cores;
         }
+
+        if (ratio && numCores > ratio && !logical) {
+            numCores = numCores / ratio;
+        }
+
 failed:
         fclose(cpuinfo);
-        return numPhysicalCores = numPhysicalCores / ratio;
+        return numCores;
     }
 }
 
@@ -1199,52 +1313,70 @@ failed:
 
 /* Use physical core sysctl when available
  * see: man 4 smp, man 3 sysctl */
-int UTIL_countPhysicalCores(void)
+int UTIL_countCores(int logical)
 {
-    static int numPhysicalCores = 0; /* freebsd sysctl is native int sized */
-    if (numPhysicalCores != 0) return numPhysicalCores;
+    static int numCores = 0; /* freebsd sysctl is native int sized */
+#if __FreeBSD_version >= 1300008
+    static int perCore = 1;
+#endif
+    if (numCores != 0) return numCores;
 
 #if __FreeBSD_version >= 1300008
-    {   size_t size = sizeof(numPhysicalCores);
-        int ret = sysctlbyname("kern.smp.cores", &numPhysicalCores, &size, NULL, 0);
-        if (ret == 0) return numPhysicalCores;
+    {   size_t size = sizeof(numCores);
+        int ret = sysctlbyname("kern.smp.cores", &numCores, &size, NULL, 0);
+        if (ret == 0) {
+            if (logical) {
+                ret = sysctlbyname("kern.smp.threads_per_core", &perCore, &size, NULL, 0);
+                /* default to physical cores if logical cannot be read */
+                if (ret == 0)
+                    numCores *= perCore;
+            }
+
+            return numCores;
+        }
         if (errno != ENOENT) {
-            perror("zstd: can't get number of physical cpus");
+            perror("zstd: can't get number of cpus");
             exit(1);
         }
         /* sysctl not present, fall through to older sysconf method */
     }
+#else
+    /* suppress unused parameter warning */
+    (void) logical;
 #endif
 
-    numPhysicalCores = (int)sysconf(_SC_NPROCESSORS_ONLN);
-    if (numPhysicalCores == -1) {
+    numCores = (int)sysconf(_SC_NPROCESSORS_ONLN);
+    if (numCores == -1) {
         /* value not queryable, fall back on 1 */
-        numPhysicalCores = 1;
+        numCores = 1;
     }
-    return numPhysicalCores;
+    return numCores;
 }
 
 #elif defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__CYGWIN__)
 
 /* Use POSIX sysconf
  * see: man 3 sysconf */
-int UTIL_countPhysicalCores(void)
+int UTIL_countCores(int logical)
 {
-    static int numPhysicalCores = 0;
+    static int numCores = 0;
+
+    /* suppress unused parameter warning */
+    (void)logical;
 
-    if (numPhysicalCores != 0) return numPhysicalCores;
+    if (numCores != 0) return numCores;
 
-    numPhysicalCores = (int)sysconf(_SC_NPROCESSORS_ONLN);
-    if (numPhysicalCores == -1) {
+    numCores = (int)sysconf(_SC_NPROCESSORS_ONLN);
+    if (numCores == -1) {
         /* value not queryable, fall back on 1 */
-        return numPhysicalCores = 1;
+        return numCores = 1;
     }
-    return numPhysicalCores;
+    return numCores;
 }
 
 #else
 
-int UTIL_countPhysicalCores(void)
+int UTIL_countCores(int logical)
 {
     /* assume 1 */
     return 1;
@@ -1252,6 +1384,16 @@ int UTIL_countPhysicalCores(void)
 
 #endif
 
+int UTIL_countPhysicalCores(void)
+{
+    return UTIL_countCores(0);
+}
+
+int UTIL_countLogicalCores(void)
+{
+    return UTIL_countCores(1);
+}
+
 #if defined (__cplusplus)
 }
 #endif
diff --git a/programs/util.h b/programs/util.h
index 25fa3f53aab4..add165d57ce9 100644
--- a/programs/util.h
+++ b/programs/util.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020, Przemyslaw Skibinski, Yann Collet, Facebook, Inc.
+ * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -64,7 +64,7 @@ extern "C" {
 #    define SET_REALTIME_PRIORITY /* disabled */
 #  endif
 
-#else  /* unknown non-unix operating systen */
+#else  /* unknown non-unix operating system */
 #  define UTIL_sleep(s)          /* disabled */
 #  define UTIL_sleepMilli(milli) /* disabled */
 #  define SET_REALTIME_PRIORITY  /* disabled */
@@ -122,6 +122,7 @@ int UTIL_requireUserConfirmation(const char* prompt, const char* abortMsg, const
 #define STRDUP(s) strdup(s)
 #endif
 
+
 /**
  * Calls platform's equivalent of stat() on filename and writes info to statbuf.
  * Returns success (1) or failure (0).
@@ -135,6 +136,14 @@ int UTIL_stat(const char* filename, stat_t* statbuf);
  */
 int UTIL_setFileStat(const char* filename, const stat_t* statbuf);
 
+/**
+ * Set atime to now and mtime to the st_mtim in statbuf.
+ *
+ * Directly wraps utime() or utimensat(). Returns -1 on error.
+ * Does not validate filename is valid.
+ */
+int UTIL_utime(const char* filename, const stat_t *statbuf);
+
 /*
  * These helpers operate on a pre-populated stat_t, i.e., the result of
  * calling one of the above functions.
@@ -143,6 +152,7 @@ int UTIL_setFileStat(const char* filename, const stat_t* statbuf);
 int UTIL_isRegularFileStat(const stat_t* statbuf);
 int UTIL_isDirectoryStat(const stat_t* statbuf);
 int UTIL_isFIFOStat(const stat_t* statbuf);
+int UTIL_isBlockDevStat(const stat_t* statbuf);
 U64 UTIL_getFileSizeStat(const stat_t* statbuf);
 
 /**
@@ -169,6 +179,23 @@ int UTIL_isFIFO(const char* infilename);
 U64 UTIL_getFileSize(const char* infilename);
 U64 UTIL_getTotalFileSize(const char* const * fileNamesTable, unsigned nbFiles);
 
+/**
+ * Take @size in bytes,
+ * prepare the components to pretty-print it in a scaled way.
+ * The components in the returned struct should be passed in
+ * precision, value, suffix order to a "%.*f%s" format string.
+ * Output policy is sensible to @g_utilDisplayLevel,
+ * for verbose mode (@g_utilDisplayLevel >= 4),
+ * does not scale down.
+ */
+typedef struct {
+  double value;
+  int precision;
+  const char* suffix;
+} UTIL_HumanReadableSize_t;
+
+UTIL_HumanReadableSize_t UTIL_makeHumanReadableSize(U64 size);
+
 int UTIL_compareStr(const void *p1, const void *p2);
 const char* UTIL_getFileExtension(const char* infilename);
 void  UTIL_mirrorSourceFilesDirectories(const char** fileNamesTable, unsigned int nbFiles, const char *outDirName);
@@ -272,15 +299,21 @@ void UTIL_refFilename(FileNamesTable* fnt, const char* filename);
  *        or NULL in case of error
  */
 FileNamesTable*
-UTIL_createExpandedFNT(const char** filenames, size_t nbFilenames, int followLinks);
+UTIL_createExpandedFNT(const char* const* filenames, size_t nbFilenames, int followLinks);
 
+#if defined(_WIN32) || defined(WIN32)
+DWORD CountSetBits(ULONG_PTR bitMask);
+#endif
 
 /*-****************************************
  *  System
  ******************************************/
 
+int UTIL_countCores(int logical);
+
 int UTIL_countPhysicalCores(void);
 
+int UTIL_countLogicalCores(void);
 
 #if defined (__cplusplus)
 }
diff --git a/programs/zstd.1 b/programs/zstd.1
index a8fc277672b6..c7a19dbacace 100644
--- a/programs/zstd.1
+++ b/programs/zstd.1
@@ -1,5 +1,5 @@
 .
-.TH "ZSTD" "1" "December 2020" "zstd 1.4.8" "User Commands"
+.TH "ZSTD" "1" "January 2022" "zstd 1.5.2" "User Commands"
 .
 .SH "NAME"
 \fBzstd\fR \- zstd, zstdmt, unzstd, zstdcat \- Compress or decompress \.zst files
@@ -105,12 +105,15 @@ Display information related to a zstd compressed file, such as size, ratio, and
 \fB\-\-fast[=#]\fR: switch to ultra\-fast compression levels\. If \fB=#\fR is not present, it defaults to \fB1\fR\. The higher the value, the faster the compression speed, at the cost of some compression ratio\. This setting overwrites compression level if one was set previously\. Similarly, if a compression level is set after \fB\-\-fast\fR, it overrides it\.
 .
 .IP "\(bu" 4
-\fB\-T#\fR, \fB\-\-threads=#\fR: Compress using \fB#\fR working threads (default: 1)\. If \fB#\fR is 0, attempt to detect and use the number of physical CPU cores\. In all cases, the nb of threads is capped to ZSTDMT_NBWORKERS_MAX==200\. This modifier does nothing if \fBzstd\fR is compiled without multithread support\.
+\fB\-T#\fR, \fB\-\-threads=#\fR: Compress using \fB#\fR working threads (default: 1)\. If \fB#\fR is 0, attempt to detect and use the number of physical CPU cores\. In all cases, the nb of threads is capped to \fBZSTDMT_NBWORKERS_MAX\fR, which is either 64 in 32\-bit mode, or 256 for 64\-bit environments\. This modifier does nothing if \fBzstd\fR is compiled without multithread support\.
 .
 .IP "\(bu" 4
 \fB\-\-single\-thread\fR: Does not spawn a thread for compression, use a single thread for both I/O and compression\. In this mode, compression is serialized with I/O, which is slightly slower\. (This is different from \fB\-T1\fR, which spawns 1 compression thread in parallel of I/O)\. This mode is the only one available when multithread support is disabled\. Single\-thread mode features lower memory usage\. Final compressed result is slightly different from \fB\-T1\fR\.
 .
 .IP "\(bu" 4
+\fB\-\-auto\-threads={physical,logical} (default: physical)\fR: When using a default amount of threads via \fB\-T0\fR, choose the default based on the number of detected physical or logical cores\.
+.
+.IP "\(bu" 4
 \fB\-\-adapt[=min=#,max=#]\fR : \fBzstd\fR will dynamically adapt compression level to perceived I/O conditions\. Compression level adaptation can be observed live by using command \fB\-v\fR\. Adaptation can be constrained between supplied \fBmin\fR and \fBmax\fR levels\. The feature works when combined with multi\-threading and \fB\-\-long\fR mode\. It does not work with \fB\-\-single\-thread\fR\. It sets window size to 8 MB by default (can be changed manually, see \fBwlog\fR)\. Due to the chaotic nature of dynamic adaptation, compressed result is not reproducible\. \fInote\fR : at the time of this writing, \fB\-\-adapt\fR can remain stuck at low speed when combined with multiple worker threads (>=2)\.
 .
 .IP "\(bu" 4
@@ -129,7 +132,7 @@ Note: If \fBwindowLog\fR is set to larger than 27, \fB\-\-long=windowLog\fR or \
 Note: cannot use both this and \-D together Note: \fB\-\-long\fR mode will be automatically activated if chainLog < fileLog (fileLog being the windowLog required to cover the whole file)\. You can also manually force it\. Node: for all levels, you can use \-\-patch\-from in \-\-single\-thread mode to improve compression ratio at the cost of speed Note: for level 19, you can get increased compression ratio at the cost of speed by specifying \fB\-\-zstd=targetLength=\fR to be something large (i\.e 4096), and by setting a large \fB\-\-zstd=chainLog=\fR
 .
 .IP "\(bu" 4
-\fB\-\-rsyncable\fR : \fBzstd\fR will periodically synchronize the compression state to make the compressed file more rsync\-friendly\. There is a negligible impact to compression ratio, and the faster compression levels will see a small compression speed hit\. This feature does not work with \fB\-\-single\-thread\fR\. You probably don\'t want to use it with long range mode, since it will decrease the effectiveness of the synchronization points, but your milage may vary\.
+\fB\-\-rsyncable\fR : \fBzstd\fR will periodically synchronize the compression state to make the compressed file more rsync\-friendly\. There is a negligible impact to compression ratio, and the faster compression levels will see a small compression speed hit\. This feature does not work with \fB\-\-single\-thread\fR\. You probably don\'t want to use it with long range mode, since it will decrease the effectiveness of the synchronization points, but your mileage may vary\.
 .
 .IP "\(bu" 4
 \fB\-C\fR, \fB\-\-[no\-]check\fR: add integrity check computed from uncompressed data (default: enabled)
@@ -146,6 +149,9 @@ Note: cannot use both this and \-D together Note: \fB\-\-long\fR mode will be au
 .IP
 This is also used during compression when using with \-\-patch\-from=\. In this case, this parameter overrides that maximum size allowed for a dictionary\. (128 MB)\.
 .
+.IP
+Additionally, this can be used to limit memory for dictionary training\. This parameter overrides the default limit of 2 GB\. zstd will load training samples up to the memory limit and ignore the rest\.
+.
 .IP "\(bu" 4
 \fB\-\-stream\-size=#\fR : Sets the pledged source size of input coming from a stream\. This value must be exact, as it will be included in the produced frame header\. Incorrect stream sizes will cause an error\. This information will be used to better optimize compression parameters, resulting in better and potentially faster compression, especially for smaller source sizes\.
 .
@@ -156,10 +162,10 @@ This is also used during compression when using with \-\-patch\-from=\. In this
 \fB\-o FILE\fR: save result into \fBFILE\fR
 .
 .IP "\(bu" 4
-\fB\-f\fR, \fB\-\-force\fR: overwrite output without prompting, and (de)compress symbolic links
+\fB\-f\fR, \fB\-\-force\fR: disable input and output checks\. Allows overwriting existing files, input from console, output to stdout, operating on links, block devices, etc\.
 .
 .IP "\(bu" 4
-\fB\-c\fR, \fB\-\-stdout\fR: force write to standard output, even if it is the console
+\fB\-c\fR, \fB\-\-stdout\fR: write to standard output (even if it is the console)
 .
 .IP "\(bu" 4
 \fB\-\-[no\-]sparse\fR: enable / disable sparse FS support, to make files with many zeroes smaller on disk\. Creating sparse files may save disk space and speed up decompression by reducing the amount of disk I/O\. default: enabled when output is into a file, and disabled when output is stdout\. This setting overrides default and can force sparse mode over stdout\.
@@ -171,7 +177,7 @@ This is also used during compression when using with \-\-patch\-from=\. In this
 \fB\-k\fR, \fB\-\-keep\fR: keep source file(s) after successful compression or decompression\. This is the default behavior\.
 .
 .IP "\(bu" 4
-\fB\-r\fR: operate recursively on directories
+\fB\-r\fR: operate recursively on directories\. It selects all files in the named directory and all its subdirectories\. This can be useful both to reduce command line typing, and to circumvent shell expansion limitations, when there are a lot of files and naming breaks the maximum size of a command line\.
 .
 .IP "\(bu" 4
 \fB\-\-filelist FILE\fR read a list of files to process as content from \fBFILE\fR\. Format is compatible with \fBls\fR output, with one file per line\.
@@ -218,7 +224,7 @@ Using environment variables to set parameters has security implications\. Theref
 \fBZSTD_CLEVEL\fR can be used to set the level between 1 and 19 (the "normal" range)\. If the value of \fBZSTD_CLEVEL\fR is not a valid integer, it will be ignored with a warning message\. \fBZSTD_CLEVEL\fR just replaces the default compression level (\fB3\fR)\.
 .
 .P
-\fBZSTD_NBTHREADS\fR can be used to set the number of threads \fBzstd\fR will attempt to use during compression\. If the value of \fBZSTD_NBTHREADS\fR is not a valid unsigned integer, it will be ignored with a warning message\. \'ZSTD_NBTHREADS\fBhas a default value of (\fR1\fB), and is capped at ZSTDMT_NBWORKERS_MAX==200\.\fRzstd` must be compiled with multithread support for this to have any effect\.
+\fBZSTD_NBTHREADS\fR can be used to set the number of threads \fBzstd\fR will attempt to use during compression\. If the value of \fBZSTD_NBTHREADS\fR is not a valid unsigned integer, it will be ignored with a warning message\. \fBZSTD_NBTHREADS\fR has a default value of (\fB1\fR), and is capped at ZSTDMT_NBWORKERS_MAX==200\. \fBzstd\fR must be compiled with multithread support for this to have any effect\.
 .
 .P
 They can both be overridden by corresponding command line arguments: \fB\-#\fR for compression level and \fB\-T#\fR for number of compression threads\.
@@ -228,10 +234,10 @@ They can both be overridden by corresponding command line arguments: \fB\-#\fR f
 .
 .TP
 \fB\-\-train FILEs\fR
-Use FILEs as training set to create a dictionary\. The training set should contain a lot of small files (> 100), and weight typically 100x the target dictionary size (for example, 10 MB for a 100 KB dictionary)\.
+Use FILEs as training set to create a dictionary\. The training set should contain a lot of small files (> 100), and weight typically 100x the target dictionary size (for example, 10 MB for a 100 KB dictionary)\. \fB\-\-train\fR can be combined with \fB\-r\fR to indicate a directory rather than listing all the files, which can be useful to circumvent shell expansion limits\.
 .
 .IP
-Supports multithreading if \fBzstd\fR is compiled with threading support\. Additional parameters can be specified with \fB\-\-train\-fastcover\fR\. The legacy dictionary builder can be accessed with \fB\-\-train\-legacy\fR\. The cover dictionary builder can be accessed with \fB\-\-train\-cover\fR\. Equivalent to \fB\-\-train\-fastcover=d=8,steps=4\fR\.
+\fB\-\-train\fR supports multithreading if \fBzstd\fR is compiled with threading support (default)\. Additional parameters can be specified with \fB\-\-train\-fastcover\fR\. The legacy dictionary builder can be accessed with \fB\-\-train\-legacy\fR\. The slower cover dictionary builder can be accessed with \fB\-\-train\-cover\fR\. Default is equivalent to \fB\-\-train\-fastcover=d=8,steps=4\fR\.
 .
 .TP
 \fB\-o file\fR
@@ -247,7 +253,11 @@ Use \fB#\fR compression level during training (optional)\. Will generate statist
 .
 .TP
 \fB\-B#\fR
-Split input files in blocks of size # (default: no split)
+Split input files into blocks of size # (default: no split)
+.
+.TP
+\fB\-M#\fR, \fB\-\-memory=#\fR
+Limit the amount of sample data loaded for training (default: 2 GB)\. See above for details\.
 .
 .TP
 \fB\-\-dictID=#\fR
@@ -343,6 +353,9 @@ set process priority to real\-time
 .
 .SH "ADVANCED COMPRESSION OPTIONS"
 .
+.SS "\-B#:"
+Select the size of each compression job\. This parameter is only available when multi\-threading is enabled\. Each compression job is run in parallel, so this value indirectly impacts the nb of active threads\. Default job size varies depending on compression level (generally \fB4 * windowSize\fR)\. \fB\-B#\fR makes it possible to manually select a custom size\. Note that job size must respect a minimum value which is enforced transparently\. This minimum is either 512 KB, or \fBoverlapSize\fR, whichever is largest\. Different job sizes will lead to (slightly) different compressed frames\.
+.
 .SS "\-\-zstd[=options]:"
 \fBzstd\fR provides 22 predefined compression levels\. The selected or default predefined compression level can be changed with advanced compression options\. The \fIoptions\fR are provided as a comma\-separated list\. You may specify only the options you want to change and the rest will be taken from the selected or default compression level\. The list of available \fIoptions\fR:
 .
@@ -481,9 +494,6 @@ The following parameters sets advanced compression options to something similar
 .P
 \fB\-\-zstd\fR=wlog=23,clog=23,hlog=22,slog=6,mml=3,tlen=48,strat=6
 .
-.SS "\-B#:"
-Select the size of each compression job\. This parameter is available only when multi\-threading is enabled\. Default value is \fB4 * windowSize\fR, which means it varies depending on compression level\. \fB\-B#\fR makes it possible to select a custom value\. Note that job size must respect a minimum value which is enforced transparently\. This minimum is either 1 MB, or \fBoverlapSize\fR, whichever is largest\.
-.
 .SH "BUGS"
 Report bugs at: https://github\.com/facebook/zstd/issues
 .
diff --git a/programs/zstd.1.md b/programs/zstd.1.md
index 73670daf6dc2..e343ec0448b4 100644
--- a/programs/zstd.1.md
+++ b/programs/zstd.1.md
@@ -115,7 +115,8 @@ the last one takes effect.
 * `-T#`, `--threads=#`:
     Compress using `#` working threads (default: 1).
     If `#` is 0, attempt to detect and use the number of physical CPU cores.
-    In all cases, the nb of threads is capped to ZSTDMT_NBWORKERS_MAX==200.
+    In all cases, the nb of threads is capped to `ZSTDMT_NBWORKERS_MAX`,
+    which is either 64 in 32-bit mode, or 256 for 64-bit environments.
     This modifier does nothing if `zstd` is compiled without multithread support.
 * `--single-thread`:
     Does not spawn a thread for compression, use a single thread for both I/O and compression.
@@ -124,6 +125,9 @@ the last one takes effect.
     This mode is the only one available when multithread support is disabled.
     Single-thread mode features lower memory usage.
     Final compressed result is slightly different from `-T1`.
+* `--auto-threads={physical,logical} (default: physical)`:
+    When using a default amount of threads via `-T0`, choose the default based on the number
+    of detected physical or logical cores.
 * `--adapt[=min=#,max=#]` :
     `zstd` will dynamically adapt compression level to perceived I/O conditions.
     Compression level adaptation can be observed live by using command `-v`.
@@ -167,7 +171,7 @@ the last one takes effect.
     compression speed hit.
     This feature does not work with `--single-thread`. You probably don't want
     to use it with long range mode, since it will decrease the effectiveness of
-    the synchronization points, but your milage may vary.
+    the synchronization points, but your mileage may vary.
 * `-C`, `--[no-]check`:
     add integrity check computed from uncompressed data (default: enabled)
 * `--[no-]content-size`:
@@ -186,6 +190,10 @@ the last one takes effect.
 
     This is also used during compression when using with --patch-from=. In this case,
     this parameter overrides that maximum size allowed for a dictionary. (128 MB).
+
+    Additionally, this can be used to limit memory for dictionary training. This parameter
+    overrides the default limit of 2 GB. zstd will load training samples up to the memory limit
+    and ignore the rest.
 * `--stream-size=#` :
     Sets the pledged source size of input coming from a stream. This value must be exact, as it
     will be included in the produced frame header. Incorrect stream sizes will cause an error.
@@ -201,9 +209,10 @@ the last one takes effect.
 * `-o FILE`:
     save result into `FILE`
 * `-f`, `--force`:
-    overwrite output without prompting, and (de)compress symbolic links
+    disable input and output checks. Allows overwriting existing files, input
+    from console, output to stdout, operating on links, block devices, etc.
 * `-c`, `--stdout`:
-    force write to standard output, even if it is the console
+    write to standard output (even if it is the console)
 * `--[no-]sparse`:
     enable / disable sparse FS support,
     to make files with many zeroes smaller on disk.
@@ -214,12 +223,16 @@ the last one takes effect.
     This setting overrides default and can force sparse mode over stdout.
 * `--rm`:
     remove source file(s) after successful compression or decompression. If used in combination with
-    -o, will trigger a confirmation prompt (which can be silenced with -f), as this is a destructive operation. 
+    -o, will trigger a confirmation prompt (which can be silenced with -f), as this is a destructive operation.
 * `-k`, `--keep`:
     keep source file(s) after successful compression or decompression.
     This is the default behavior.
 * `-r`:
-    operate recursively on directories
+    operate recursively on directories.
+    It selects all files in the named directory and all its subdirectories.
+    This can be useful both to reduce command line typing,
+    and to circumvent shell expansion limitations,
+    when there are a lot of files and naming breaks the maximum size of a command line.
 * `--filelist FILE`
     read a list of files to process as content from `FILE`.
     Format is compatible with `ls` output, with one file per line.
@@ -280,11 +293,11 @@ If the value of `ZSTD_CLEVEL` is not a valid integer, it will be ignored with a
 
 `ZSTD_NBTHREADS` can be used to set the number of threads `zstd` will attempt to use during compression.
 If the value of `ZSTD_NBTHREADS` is not a valid unsigned integer, it will be ignored with a warning message.
-'ZSTD_NBTHREADS` has a default value of (`1`), and is capped at ZSTDMT_NBWORKERS_MAX==200. `zstd` must be
+`ZSTD_NBTHREADS` has a default value of (`1`), and is capped at ZSTDMT_NBWORKERS_MAX==200. `zstd` must be
 compiled with multithread support for this to have any effect.
 
 They can both be overridden by corresponding command line arguments:
-`-#` for compression level and `-T#` for number of compression threads. 
+`-#` for compression level and `-T#` for number of compression threads.
 
 
 DICTIONARY BUILDER
@@ -302,12 +315,14 @@ Compression of small files similar to the sample set will be greatly improved.
     The training set should contain a lot of small files (> 100),
     and weight typically 100x the target dictionary size
     (for example, 10 MB for a 100 KB dictionary).
+    `--train` can be combined with `-r` to indicate a directory rather than listing all the files,
+    which can be useful to circumvent shell expansion limits.
 
-    Supports multithreading if `zstd` is compiled with threading support.
+    `--train` supports multithreading if `zstd` is compiled with threading support (default).
     Additional parameters can be specified with `--train-fastcover`.
     The legacy dictionary builder can be accessed with `--train-legacy`.
-    The cover dictionary builder can be accessed with `--train-cover`.
-    Equivalent to `--train-fastcover=d=8,steps=4`.
+    The slower cover dictionary builder can be accessed with `--train-cover`.
+    Default is equivalent to `--train-fastcover=d=8,steps=4`.
 * `-o file`:
     Dictionary saved into `file` (default name: dictionary).
 * `--maxdict=#`:
@@ -317,10 +332,12 @@ Compression of small files similar to the sample set will be greatly improved.
     Will generate statistics more tuned for selected compression level,
     resulting in a _small_ compression ratio improvement for this level.
 * `-B#`:
-    Split input files in blocks of size # (default: no split)
+    Split input files into blocks of size # (default: no split)
+* `-M#`, `--memory=#`:
+    Limit the amount of sample data loaded for training (default: 2 GB). See above for details.
 * `--dictID=#`:
-    A dictionary ID is a locally unique ID that a decoder can use to verify it is
-    using the right dictionary.
+    A dictionary ID is a locally unique ID
+    that a decoder can use to verify it is using the right dictionary.
     By default, zstd will create a 4-bytes random number ID.
     It's possible to give a precise number instead.
     Short numbers have an advantage : an ID < 256 will only need 1 byte in the
@@ -422,6 +439,16 @@ BENCHMARK
 
 ADVANCED COMPRESSION OPTIONS
 ----------------------------
+### -B#:
+Select the size of each compression job.
+This parameter is only available when multi-threading is enabled.
+Each compression job is run in parallel, so this value indirectly impacts the nb of active threads.
+Default job size varies depending on compression level (generally  `4 * windowSize`).
+`-B#` makes it possible to manually select a custom size.
+Note that job size must respect a minimum value which is enforced transparently.
+This minimum is either 512 KB, or `overlapSize`, whichever is largest.
+Different job sizes will lead to (slightly) different compressed frames.
+
 ### --zstd[=options]:
 `zstd` provides 22 predefined compression levels.
 The selected or default predefined compression level can be changed with
@@ -565,13 +592,6 @@ similar to predefined level 19 for files bigger than 256 KB:
 
 `--zstd`=wlog=23,clog=23,hlog=22,slog=6,mml=3,tlen=48,strat=6
 
-### -B#:
-Select the size of each compression job.
-This parameter is available only when multi-threading is enabled.
-Default value is `4 * windowSize`, which means it varies depending on compression level.
-`-B#` makes it possible to select a custom value.
-Note that job size must respect a minimum value which is enforced transparently.
-This minimum is either 1 MB, or `overlapSize`, whichever is largest.
 
 BUGS
 ----
diff --git a/programs/zstdcli.c b/programs/zstdcli.c
index 9b6f91533462..bfe18c0c1ba3 100644
--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -42,6 +42,9 @@
 #ifndef ZSTD_NODICT
 #  include "dibio.h"  /* ZDICT_cover_params_t, DiB_trainFromFiles() */
 #endif
+#ifndef ZSTD_NOTRACE
+#  include "zstdcli_trace.h"
+#endif
 #include "../lib/zstd.h"  /* ZSTD_VERSION_STRING, ZSTD_minCLevel, ZSTD_maxCLevel */
 
 
@@ -104,6 +107,24 @@ static int g_displayLevel = DISPLAY_LEVEL_DEFAULT;   /* 0 : no display,  1: erro
 
 
 /*-************************************
+*  Check Version (when CLI linked to dynamic library)
+**************************************/
+
+/* Due to usage of experimental symbols and capabilities by the CLI,
+ * the CLI must be linked against a dynamic library of same version */
+static void checkLibVersion(void)
+{
+    if (strcmp(ZSTD_VERSION_STRING, ZSTD_versionString())) {
+        DISPLAYLEVEL(1, "Error : incorrect library version (expecting : %s ; actual : %s ) \n",
+                    ZSTD_VERSION_STRING, ZSTD_versionString());
+        DISPLAYLEVEL(1, "Please update library to version %s, or use stand-alone zstd binary \n",
+                    ZSTD_VERSION_STRING);
+        exit(1);
+    }
+}
+
+
+/*-************************************
 *  Command Line
 **************************************/
 /* print help either in `stderr` or `stdout` depending on originating request
@@ -126,7 +147,9 @@ static void usage(FILE* f, const char* programName)
 #endif
     DISPLAY_F(f, " -D DICT: use DICT as Dictionary for compression or decompression \n");
     DISPLAY_F(f, " -o file: result stored into `file` (only 1 output file) \n");
-    DISPLAY_F(f, " -f     : overwrite output without prompting, also (de)compress links \n");
+    DISPLAY_F(f, " -f     : disable input and output checks. Allows overwriting existing files,\n");
+    DISPLAY_F(f, "          input from console, output to stdout, operating on links,\n");
+    DISPLAY_F(f, "          block devices, etc.\n");
     DISPLAY_F(f, "--rm    : remove source file(s) after successful de/compression \n");
     DISPLAY_F(f, " -k     : preserve source file(s) (default) \n");
     DISPLAY_F(f, " -h/-H  : display help/long help and exit \n");
@@ -140,11 +163,12 @@ static void usage_advanced(const char* programName)
     DISPLAYOUT( "Advanced arguments : \n");
     DISPLAYOUT( " -V     : display Version number and exit \n");
 
-    DISPLAYOUT( " -c     : force write to standard output, even if it is the console \n");
+    DISPLAYOUT( " -c     : write to standard output (even if it is the console) \n");
 
     DISPLAYOUT( " -v     : verbose mode; specify multiple times to increase verbosity \n");
     DISPLAYOUT( " -q     : suppress warnings; specify twice to suppress errors too \n");
-    DISPLAYOUT( "--no-progress : do not display the progress counter \n");
+    DISPLAYOUT( "--[no-]progress : forcibly display, or never display the progress counter.\n");
+    DISPLAYOUT( "                  note: any (de)compressed output to terminal will mix with progress counter text. \n");
 
 #ifdef UTIL_HAS_CREATEFILELIST
     DISPLAYOUT( " -r     : operate recursively on directories \n");
@@ -167,6 +191,11 @@ static void usage_advanced(const char* programName)
     DISPLAYOUT( "--[no-]check : during decompression, ignore/validate checksums in compressed frame (default: validate).");
 #endif
 #endif /* ZSTD_NOCOMPRESS */
+
+#ifndef ZSTD_NOTRACE
+    DISPLAYOUT( "\n");
+    DISPLAYOUT( "--trace FILE : log tracing information to FILE.");
+#endif
     DISPLAYOUT( "\n");
 
     DISPLAYOUT( "--      : All arguments after \"--\" are treated as files \n");
@@ -178,10 +207,13 @@ static void usage_advanced(const char* programName)
     DISPLAYOUT( "--long[=#]: enable long distance matching with given window log (default: %u) \n", g_defaultMaxWindowLog);
     DISPLAYOUT( "--fast[=#]: switch to very fast compression levels (default: %u) \n", 1);
     DISPLAYOUT( "--adapt : dynamically adapt compression level to I/O conditions \n");
+    DISPLAYOUT( "--[no-]row-match-finder : force enable/disable usage of fast row-based matchfinder for greedy, lazy, and lazy2 strategies \n");
+    DISPLAYOUT( "--patch-from=FILE : specify the file to be used as a reference point for zstd's diff engine. \n");
 # ifdef ZSTD_MULTITHREAD
     DISPLAYOUT( " -T#    : spawns # compression threads (default: 1, 0==# cores) \n");
     DISPLAYOUT( " -B#    : select size of each job (default: 0==automatic) \n");
     DISPLAYOUT( "--single-thread : use a single thread for both I/O and compression (result slightly different than -T1) \n");
+    DISPLAYOUT( "--auto-threads={physical,logical} (default: physical} : use either physical cores or logical cores as default when specifying -T0 \n");
     DISPLAYOUT( "--rsyncable : compress using a rsync-friendly method (-B sets block size) \n");
 # endif
     DISPLAYOUT( "--exclude-compressed: only compress files that are not already compressed \n");
@@ -323,6 +355,23 @@ static unsigned readU32FromChar(const char** stringPtr) {
     return result;
 }
 
+/*! readIntFromChar() :
+ * @return : signed integer value read from input in `char` format.
+ *  allows and interprets K, KB, KiB, M, MB and MiB suffix.
+ *  Will also modify `*stringPtr`, advancing it to position where it stopped reading.
+ *  Note : function will exit() program if digit sequence overflows */
+static int readIntFromChar(const char** stringPtr) {
+    static const char errorMsg[] = "error: numeric value overflows 32-bit int";
+    int sign = 1;
+    unsigned result;
+    if (**stringPtr=='-') {
+        (*stringPtr)++;
+        sign = -1;
+    }
+    if (readU32FromCharChecked(stringPtr, &result)) { errorOut(errorMsg); }
+    return (int) result * sign;
+}
+
 /*! readSizeTFromCharChecked() :
  * @return 0 if success, and store the result in *value.
  *  allows and interprets K, KB, KiB, M, MB and MiB suffix.
@@ -516,8 +565,8 @@ static ZDICT_fastCover_params_t defaultFastCoverParams(void)
 static unsigned parseAdaptParameters(const char* stringPtr, int* adaptMinPtr, int* adaptMaxPtr)
 {
     for ( ; ;) {
-        if (longCommandWArg(&stringPtr, "min=")) { *adaptMinPtr = (int)readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
-        if (longCommandWArg(&stringPtr, "max=")) { *adaptMaxPtr = (int)readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        if (longCommandWArg(&stringPtr, "min=")) { *adaptMinPtr = readIntFromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        if (longCommandWArg(&stringPtr, "max=")) { *adaptMaxPtr = readIntFromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
         DISPLAYLEVEL(4, "invalid compression parameter \n");
         return 0;
     }
@@ -598,6 +647,48 @@ static void printVersion(void)
     }   }
 }
 
+#define ZSTD_NB_STRATEGIES 9
+static const char* ZSTD_strategyMap[ZSTD_NB_STRATEGIES + 1] = { "", "ZSTD_fast",
+                "ZSTD_dfast", "ZSTD_greedy", "ZSTD_lazy", "ZSTD_lazy2", "ZSTD_btlazy2",
+                "ZSTD_btopt", "ZSTD_btultra", "ZSTD_btultra2"};
+
+#ifndef ZSTD_NOCOMPRESS
+
+static void printDefaultCParams(const char* filename, const char* dictFileName, int cLevel) {
+    unsigned long long fileSize = UTIL_getFileSize(filename);
+    const size_t dictSize = dictFileName != NULL ? (size_t)UTIL_getFileSize(dictFileName) : 0;
+    const ZSTD_compressionParameters cParams = ZSTD_getCParams(cLevel, fileSize, dictSize);
+    if (fileSize != UTIL_FILESIZE_UNKNOWN) DISPLAY("%s (%u bytes)\n", filename, (unsigned)fileSize);
+    else DISPLAY("%s (src size unknown)\n", filename);
+    DISPLAY(" - windowLog     : %u\n", cParams.windowLog);
+    DISPLAY(" - chainLog      : %u\n", cParams.chainLog);
+    DISPLAY(" - hashLog       : %u\n", cParams.hashLog);
+    DISPLAY(" - searchLog     : %u\n", cParams.searchLog);
+    DISPLAY(" - minMatch      : %u\n", cParams.minMatch);
+    DISPLAY(" - targetLength  : %u\n", cParams.targetLength);
+    assert(cParams.strategy < ZSTD_NB_STRATEGIES + 1);
+    DISPLAY(" - strategy      : %s (%u)\n", ZSTD_strategyMap[(int)cParams.strategy], (unsigned)cParams.strategy);
+}
+
+static void printActualCParams(const char* filename, const char* dictFileName, int cLevel, const ZSTD_compressionParameters* cParams) {
+    unsigned long long fileSize = UTIL_getFileSize(filename);
+    const size_t dictSize = dictFileName != NULL ? (size_t)UTIL_getFileSize(dictFileName) : 0;
+    ZSTD_compressionParameters actualCParams = ZSTD_getCParams(cLevel, fileSize, dictSize);
+    assert(g_displayLevel >= 4);
+    actualCParams.windowLog = cParams->windowLog == 0 ? actualCParams.windowLog : cParams->windowLog;
+    actualCParams.chainLog = cParams->chainLog == 0 ? actualCParams.chainLog : cParams->chainLog;
+    actualCParams.hashLog = cParams->hashLog == 0 ? actualCParams.hashLog : cParams->hashLog;
+    actualCParams.searchLog = cParams->searchLog == 0 ? actualCParams.searchLog : cParams->searchLog;
+    actualCParams.minMatch = cParams->minMatch == 0 ? actualCParams.minMatch : cParams->minMatch;
+    actualCParams.targetLength = cParams->targetLength == 0 ? actualCParams.targetLength : cParams->targetLength;
+    actualCParams.strategy = cParams->strategy == 0 ? actualCParams.strategy : cParams->strategy;
+    DISPLAY("--zstd=wlog=%d,clog=%d,hlog=%d,slog=%d,mml=%d,tlen=%d,strat=%d\n",
+            actualCParams.windowLog, actualCParams.chainLog, actualCParams.hashLog, actualCParams.searchLog,
+            actualCParams.minMatch, actualCParams.targetLength, actualCParams.strategy);
+}
+
+#endif
+
 /* Environment variables for parameter setting */
 #define ENV_CLEVEL "ZSTD_CLEVEL"
 #define ENV_NBTHREADS "ZSTD_NBTHREADS"    /* takes lower precedence than directly specifying -T# in the CLI */
@@ -674,11 +765,6 @@ static unsigned init_nbThreads(void) {
     val32 = readU32FromChar(&__nb); \
 }
 
-#define ZSTD_NB_STRATEGIES 9
-static const char* ZSTD_strategyMap[ZSTD_NB_STRATEGIES + 1] = { "", "ZSTD_fast",
-                "ZSTD_dfast", "ZSTD_greedy", "ZSTD_lazy", "ZSTD_lazy2", "ZSTD_btlazy2",
-                "ZSTD_btopt", "ZSTD_btultra", "ZSTD_btultra2"};
-
 typedef enum { zom_compress, zom_decompress, zom_test, zom_bench, zom_train, zom_list } zstd_operation_mode;
 
 #define CLEAN_RETURN(i) { operationResult = (i); goto _end; }
@@ -692,16 +778,19 @@ typedef enum { zom_compress, zom_decompress, zom_test, zom_bench, zom_train, zom
 # define MAXCLEVEL  ZSTD_maxCLevel()
 #endif
 
-int main(int const argCount, const char* argv[])
+int main(int argCount, const char* argv[])
 {
     int argNb,
         followLinks = 0,
+        allowBlockDevices = 0,
+        forceStdin = 0,
         forceStdout = 0,
         hasStdout = 0,
         ldmFlag = 0,
         main_pause = 0,
         nbWorkers = 0,
         adapt = 0,
+        useRowMatchFinder = 0,
         adaptMin = MINCLEVEL,
         adaptMax = MAXCLEVEL,
         rsyncable = 0,
@@ -710,6 +799,9 @@ int main(int const argCount, const char* argv[])
         separateFiles = 0,
         setRealTimePrio = 0,
         singleThread = 0,
+#ifdef ZSTD_MULTITHREAD
+        defaultLogicalCores = 0,
+#endif
         showDefaultCParams = 0,
         ultra=0,
         contentSize=1;
@@ -749,10 +841,11 @@ int main(int const argCount, const char* argv[])
 #ifndef ZSTD_NOBENCH
     BMK_advancedParams_t benchParams = BMK_initAdvancedParams();
 #endif
-    ZSTD_literalCompressionMode_e literalCompressionMode = ZSTD_lcm_auto;
+    ZSTD_paramSwitch_e literalCompressionMode = ZSTD_ps_auto;
 
 
     /* init */
+    checkLibVersion();
     (void)recursive; (void)cLevelLast;    /* not used when ZSTD_NOBENCH set */
     (void)memLimit;
     assert(argCount >= 1);
@@ -807,7 +900,7 @@ int main(int const argCount, const char* argv[])
                 if (!strcmp(argument, "--compress")) { operation=zom_compress; continue; }
                 if (!strcmp(argument, "--decompress")) { operation=zom_decompress; continue; }
                 if (!strcmp(argument, "--uncompress")) { operation=zom_decompress; continue; }
-                if (!strcmp(argument, "--force")) { FIO_overwriteMode(prefs); forceStdout=1; followLinks=1; continue; }
+                if (!strcmp(argument, "--force")) { FIO_overwriteMode(prefs); forceStdin=1; forceStdout=1; followLinks=1; allowBlockDevices=1; continue; }
                 if (!strcmp(argument, "--version")) { printVersion(); CLEAN_RETURN(0); }
                 if (!strcmp(argument, "--help")) { usage_advanced(programName); CLEAN_RETURN(0); }
                 if (!strcmp(argument, "--verbose")) { g_displayLevel++; continue; }
@@ -828,6 +921,8 @@ int main(int const argCount, const char* argv[])
                 if (!strcmp(argument, "--content-size")) { contentSize = 1; continue; }
                 if (!strcmp(argument, "--no-content-size")) { contentSize = 0; continue; }
                 if (!strcmp(argument, "--adapt")) { adapt = 1; continue; }
+                if (!strcmp(argument, "--no-row-match-finder")) { useRowMatchFinder = 1; continue; }
+                if (!strcmp(argument, "--row-match-finder")) { useRowMatchFinder = 2; continue; }
                 if (longCommandWArg(&argument, "--adapt=")) { adapt = 1; if (!parseAdaptParameters(argument, &adaptMin, &adaptMax)) { badusage(programName); CLEAN_RETURN(1); } continue; }
                 if (!strcmp(argument, "--single-thread")) { nbWorkers = 0; singleThread = 1; continue; }
                 if (!strcmp(argument, "--format=zstd")) { suffix = ZSTD_EXTENSION; FIO_setCompressionType(prefs, FIO_zstdCompression); continue; }
@@ -842,9 +937,10 @@ int main(int const argCount, const char* argv[])
                 if (!strcmp(argument, "--format=lz4")) { suffix = LZ4_EXTENSION; FIO_setCompressionType(prefs, FIO_lz4Compression);  continue; }
 #endif
                 if (!strcmp(argument, "--rsyncable")) { rsyncable = 1; continue; }
-                if (!strcmp(argument, "--compress-literals")) { literalCompressionMode = ZSTD_lcm_huffman; continue; }
-                if (!strcmp(argument, "--no-compress-literals")) { literalCompressionMode = ZSTD_lcm_uncompressed; continue; }
-                if (!strcmp(argument, "--no-progress")) { FIO_setNoProgress(1); continue; }
+                if (!strcmp(argument, "--compress-literals")) { literalCompressionMode = ZSTD_ps_enable; continue; }
+                if (!strcmp(argument, "--no-compress-literals")) { literalCompressionMode = ZSTD_ps_disable; continue; }
+                if (!strcmp(argument, "--no-progress")) { FIO_setProgressSetting(FIO_ps_never); continue; }
+                if (!strcmp(argument, "--progress")) { FIO_setProgressSetting(FIO_ps_always); continue; }
                 if (!strcmp(argument, "--exclude-compressed")) { FIO_setExcludeCompressedFile(prefs, 1); continue; }
 
                 /* long commands with arguments */
@@ -895,9 +991,21 @@ int main(int const argCount, const char* argv[])
                 if (longCommandWArg(&argument, "--target-compressed-block-size=")) { targetCBlockSize = readSizeTFromChar(&argument); continue; }
                 if (longCommandWArg(&argument, "--size-hint=")) { srcSizeHint = readSizeTFromChar(&argument); continue; }
                 if (longCommandWArg(&argument, "--output-dir-flat")) { NEXT_FIELD(outDirName); continue; }
+#ifdef ZSTD_MULTITHREAD
+                if (longCommandWArg(&argument, "--auto-threads")) {
+                    const char* threadDefault = NULL;
+                    NEXT_FIELD(threadDefault);
+                    if (strcmp(threadDefault, "logical") == 0)
+                        defaultLogicalCores = 1;
+                    continue;
+                }
+#endif
 #ifdef UTIL_HAS_MIRRORFILELIST
                 if (longCommandWArg(&argument, "--output-dir-mirror")) { NEXT_FIELD(outMirroredDirName); continue; }
 #endif
+#ifndef ZSTD_NOTRACE
+                if (longCommandWArg(&argument, "--trace")) { char const* traceFile; NEXT_FIELD(traceFile); TRACE_enable(traceFile); continue; }
+#endif
                 if (longCommandWArg(&argument, "--patch-from")) { NEXT_FIELD(patchFromDictFileName); continue; }
                 if (longCommandWArg(&argument, "--long")) {
                     unsigned ldmWindowLog = 0;
@@ -988,7 +1096,7 @@ int main(int const argCount, const char* argv[])
                 case 'D': argument++; NEXT_FIELD(dictFileName); break;
 
                     /* Overwrite */
-                case 'f': FIO_overwriteMode(prefs); forceStdout=1; followLinks=1; argument++; break;
+                case 'f': FIO_overwriteMode(prefs); forceStdin=1; forceStdout=1; followLinks=1; allowBlockDevices=1; argument++; break;
 
                     /* Verbose mode */
                 case 'v': g_displayLevel++; argument++; break;
@@ -1098,15 +1206,21 @@ int main(int const argCount, const char* argv[])
 #ifdef ZSTD_MULTITHREAD
     if ((nbWorkers==0) && (!singleThread)) {
         /* automatically set # workers based on # of reported cpus */
-        nbWorkers = UTIL_countPhysicalCores();
-        DISPLAYLEVEL(3, "Note: %d physical core(s) detected \n", nbWorkers);
+        if (defaultLogicalCores) {
+            nbWorkers = UTIL_countLogicalCores();
+            DISPLAYLEVEL(3, "Note: %d logical core(s) detected \n", nbWorkers);
+        } else {
+            nbWorkers = UTIL_countPhysicalCores();
+            DISPLAYLEVEL(3, "Note: %d physical core(s) detected \n", nbWorkers);
+        }
     }
 #else
     (void)singleThread; (void)nbWorkers;
 #endif
 
-#ifdef UTIL_HAS_CREATEFILELIST
     g_utilDisplayLevel = g_displayLevel;
+
+#ifdef UTIL_HAS_CREATEFILELIST
     if (!followLinks) {
         unsigned u, fileNamesNb;
         unsigned const nbFilenames = (unsigned)filenames->tableSize;
@@ -1164,6 +1278,7 @@ int main(int const argCount, const char* argv[])
         benchParams.ldmFlag = ldmFlag;
         benchParams.ldmMinMatch = (int)g_ldmMinMatch;
         benchParams.ldmHashLog = (int)g_ldmHashLog;
+        benchParams.useRowMatchFinder = useRowMatchFinder;
         if (g_ldmBucketSizeLog != LDM_PARAM_DEFAULT) {
             benchParams.ldmBucketSizeLog = (int)g_ldmBucketSizeLog;
         }
@@ -1212,18 +1327,18 @@ int main(int const argCount, const char* argv[])
             int const optimize = !coverParams.k || !coverParams.d;
             coverParams.nbThreads = (unsigned)nbWorkers;
             coverParams.zParams = zParams;
-            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (unsigned)filenames->tableSize, blockSize, NULL, &coverParams, NULL, optimize);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, &coverParams, NULL, optimize, memLimit);
         } else if (dict == fastCover) {
             int const optimize = !fastCoverParams.k || !fastCoverParams.d;
             fastCoverParams.nbThreads = (unsigned)nbWorkers;
             fastCoverParams.zParams = zParams;
-            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (unsigned)filenames->tableSize, blockSize, NULL, NULL, &fastCoverParams, optimize);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, NULL, &fastCoverParams, optimize, memLimit);
         } else {
             ZDICT_legacy_params_t dictParams;
             memset(&dictParams, 0, sizeof(dictParams));
             dictParams.selectivityLevel = dictSelect;
             dictParams.zParams = zParams;
-            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (unsigned)filenames->tableSize, blockSize, &dictParams, NULL, NULL, 0);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, &dictParams, NULL, NULL, 0, memLimit);
         }
 #else
         (void)dictCLevel; (void)dictSelect; (void)dictID;  (void)maxDictSize; /* not used when ZSTD_NODICT set */
@@ -1243,7 +1358,9 @@ int main(int const argCount, const char* argv[])
         outFileName = stdoutmark;  /* when input is stdin, default output is stdout */
 
     /* Check if input/output defined as console; trigger an error in this case */
-    if (!strcmp(filenames->fileNames[0], stdinmark) && IS_CONSOLE(stdin) ) {
+    if (!forceStdin
+     && !strcmp(filenames->fileNames[0], stdinmark)
+     && IS_CONSOLE(stdin) ) {
         DISPLAYLEVEL(1, "stdin is a console, aborting\n");
         CLEAN_RETURN(1);
     }
@@ -1281,17 +1398,18 @@ int main(int const argCount, const char* argv[])
         DISPLAY("error : can't use --patch-from=# on multiple files \n");
         CLEAN_RETURN(1);
     }
-    
-    /* No status message in pipe mode (stdin - stdout) */	
+
+    /* No status message in pipe mode (stdin - stdout) */
     hasStdout = outFileName && !strcmp(outFileName,stdoutmark);
 
-    if (hasStdout && (g_displayLevel==2)) g_displayLevel=1;
+    if ((hasStdout || !IS_CONSOLE(stderr)) && (g_displayLevel==2)) g_displayLevel=1;
 
     /* IO Stream/File */
     FIO_setHasStdoutOutput(fCtx, hasStdout);
-    FIO_setNbFilesTotal(fCtx, (int)filenames->tableSize); 
+    FIO_setNbFilesTotal(fCtx, (int)filenames->tableSize);
     FIO_determineHasStdinInput(fCtx, filenames);
     FIO_setNotificationLevel(g_displayLevel);
+    FIO_setAllowBlockDevices(prefs, allowBlockDevices);
     FIO_setPatchFromMode(prefs, patchFromDictFileName != NULL);
     if (memLimit == 0) {
         if (compressionParams.windowLog == 0) {
@@ -1314,6 +1432,7 @@ int main(int const argCount, const char* argv[])
         if (g_ldmBucketSizeLog != LDM_PARAM_DEFAULT) FIO_setLdmBucketSizeLog(prefs, (int)g_ldmBucketSizeLog);
         if (g_ldmHashRateLog != LDM_PARAM_DEFAULT) FIO_setLdmHashRateLog(prefs, (int)g_ldmHashRateLog);
         FIO_setAdaptiveMode(prefs, (unsigned)adapt);
+        FIO_setUseRowMatchFinder(prefs, useRowMatchFinder);
         FIO_setAdaptMin(prefs, adaptMin);
         FIO_setAdaptMax(prefs, adaptMax);
         FIO_setRsyncable(prefs, rsyncable);
@@ -1329,31 +1448,24 @@ int main(int const argCount, const char* argv[])
           assert(ZSTD_NB_STRATEGIES == strategyBounds.upperBound);
           (void)strategyBounds; }
 
-        if (showDefaultCParams) {
+        if (showDefaultCParams || g_displayLevel >= 4) {
             size_t fileNb;
             for (fileNb = 0; fileNb < (size_t)filenames->tableSize; fileNb++) {
-                unsigned long long fileSize = UTIL_getFileSize(filenames->fileNames[fileNb]);
-                const size_t dictSize = dictFileName != NULL ? (size_t)UTIL_getFileSize(dictFileName) : 0;
-                const ZSTD_compressionParameters cParams = ZSTD_getCParams(cLevel, fileSize, dictSize);
-                if (fileSize != UTIL_FILESIZE_UNKNOWN) DISPLAY("%s (%u bytes)\n", filenames->fileNames[fileNb], (unsigned)fileSize);
-                else DISPLAY("%s (src size unknown)\n", filenames->fileNames[fileNb]);
-                DISPLAY(" - windowLog     : %u\n", cParams.windowLog);
-                DISPLAY(" - chainLog      : %u\n", cParams.chainLog);
-                DISPLAY(" - hashLog       : %u\n", cParams.hashLog);
-                DISPLAY(" - searchLog     : %u\n", cParams.searchLog);
-                DISPLAY(" - minMatch      : %u\n", cParams.minMatch);
-                DISPLAY(" - targetLength  : %u\n", cParams.targetLength);
-                assert(cParams.strategy < ZSTD_NB_STRATEGIES + 1);
-                DISPLAY(" - strategy      : %s (%u)\n", ZSTD_strategyMap[(int)cParams.strategy], (unsigned)cParams.strategy);
+                if (showDefaultCParams)
+                    printDefaultCParams(filenames->fileNames[fileNb], dictFileName, cLevel);
+                if (g_displayLevel >= 4)
+                    printActualCParams(filenames->fileNames[fileNb], dictFileName, cLevel, &compressionParams);
             }
         }
 
+        if (g_displayLevel >= 4)
+            FIO_displayCompressionParameters(prefs);
         if ((filenames->tableSize==1) && outFileName)
             operationResult = FIO_compressFilename(fCtx, prefs, outFileName, filenames->fileNames[0], dictFileName, cLevel, compressionParams);
         else
             operationResult = FIO_compressMultipleFilenames(fCtx, prefs, filenames->fileNames, outMirroredDirName, outDirName, outFileName, suffix, dictFileName, cLevel, compressionParams);
 #else
-        (void)contentSize; (void)suffix; (void)adapt; (void)rsyncable; (void)ultra; (void)cLevel; (void)ldmFlag; (void)literalCompressionMode; (void)targetCBlockSize; (void)streamSrcSize; (void)srcSizeHint; (void)ZSTD_strategyMap; /* not used when ZSTD_NOCOMPRESS set */
+        (void)contentSize; (void)suffix; (void)adapt; (void)rsyncable; (void)ultra; (void)cLevel; (void)ldmFlag; (void)literalCompressionMode; (void)targetCBlockSize; (void)streamSrcSize; (void)srcSizeHint; (void)ZSTD_strategyMap; (void)useRowMatchFinder; /* not used when ZSTD_NOCOMPRESS set */
         DISPLAY("Compression not supported \n");
 #endif
     } else {  /* decompression or test */
@@ -1374,6 +1486,9 @@ _end:
     if (main_pause) waitEnter();
     UTIL_freeFileNamesTable(filenames);
     UTIL_freeFileNamesTable(file_of_names);
+#ifndef ZSTD_NOTRACE
+    TRACE_finish();
+#endif
 
     return operationResult;
 }
diff --git a/programs/zstdcli_trace.c b/programs/zstdcli_trace.c
new file mode 100644
index 000000000000..b3b977feb53b
--- /dev/null
+++ b/programs/zstdcli_trace.c
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include "zstdcli_trace.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "timefn.h"
+#include "util.h"
+
+#define ZSTD_STATIC_LINKING_ONLY
+#include "../lib/zstd.h"
+/* We depend on the trace header to avoid duplicating the ZSTD_trace struct.
+ * But, we check the version so it is compatible with dynamic linking.
+ */
+#include "../lib/common/zstd_trace.h"
+/* We only use macros from threading.h so it is compatible with dynamic linking */
+#include "../lib/common/threading.h"
+
+#if ZSTD_TRACE
+
+static FILE* g_traceFile = NULL;
+static int g_mutexInit = 0;
+static ZSTD_pthread_mutex_t g_mutex;
+static UTIL_time_t g_enableTime = UTIL_TIME_INITIALIZER;
+
+void TRACE_enable(char const* filename)
+{
+    int const writeHeader = !UTIL_isRegularFile(filename);
+    if (g_traceFile)
+        fclose(g_traceFile);
+    g_traceFile = fopen(filename, "a");
+    if (g_traceFile && writeHeader) {
+        /* Fields:
+        * algorithm
+        * version
+        * method
+        * streaming
+        * level
+        * workers
+        * dictionary size
+        * uncompressed size
+        * compressed size
+        * duration nanos
+        * compression ratio
+        * speed MB/s
+        */
+        fprintf(g_traceFile, "Algorithm, Version, Method, Mode, Level, Workers, Dictionary Size, Uncompressed Size, Compressed Size, Duration Nanos, Compression Ratio, Speed MB/s\n");
+    }
+    g_enableTime = UTIL_getTime();
+    if (!g_mutexInit) {
+        if (!ZSTD_pthread_mutex_init(&g_mutex, NULL)) {
+            g_mutexInit = 1;
+        } else {
+            TRACE_finish();
+        }
+    }
+}
+
+void TRACE_finish(void)
+{
+    if (g_traceFile) {
+        fclose(g_traceFile);
+    }
+    g_traceFile = NULL;
+    if (g_mutexInit) {
+        ZSTD_pthread_mutex_destroy(&g_mutex);
+        g_mutexInit = 0;
+    }
+}
+
+static void TRACE_log(char const* method, PTime duration, ZSTD_Trace const* trace)
+{
+    int level = 0;
+    int workers = 0;
+    double const ratio = (double)trace->uncompressedSize / (double)trace->compressedSize;
+    double const speed = ((double)trace->uncompressedSize * 1000) / (double)duration;
+    if (trace->params) {
+        ZSTD_CCtxParams_getParameter(trace->params, ZSTD_c_compressionLevel, &level);
+        ZSTD_CCtxParams_getParameter(trace->params, ZSTD_c_nbWorkers, &workers);
+    }
+    assert(g_traceFile != NULL);
+
+    ZSTD_pthread_mutex_lock(&g_mutex);
+    /* Fields:
+     * algorithm
+     * version
+     * method
+     * streaming
+     * level
+     * workers
+     * dictionary size
+     * uncompressed size
+     * compressed size
+     * duration nanos
+     * compression ratio
+     * speed MB/s
+     */
+    fprintf(g_traceFile,
+        "zstd, %u, %s, %s, %d, %d, %llu, %llu, %llu, %llu, %.2f, %.2f\n",
+        trace->version,
+        method,
+        trace->streaming ? "streaming" : "single-pass",
+        level,
+        workers,
+        (unsigned long long)trace->dictionarySize,
+        (unsigned long long)trace->uncompressedSize,
+        (unsigned long long)trace->compressedSize,
+        (unsigned long long)duration,
+        ratio,
+        speed);
+    ZSTD_pthread_mutex_unlock(&g_mutex);
+}
+
+/**
+ * These symbols override the weak symbols provided by the library.
+ */
+
+ZSTD_TraceCtx ZSTD_trace_compress_begin(ZSTD_CCtx const* cctx)
+{
+    (void)cctx;
+    if (g_traceFile == NULL)
+        return 0;
+    return (ZSTD_TraceCtx)UTIL_clockSpanNano(g_enableTime);
+}
+
+void ZSTD_trace_compress_end(ZSTD_TraceCtx ctx, ZSTD_Trace const* trace)
+{
+    PTime const beginNanos = (PTime)ctx;
+    PTime const endNanos = UTIL_clockSpanNano(g_enableTime);
+    PTime const durationNanos = endNanos > beginNanos ? endNanos - beginNanos : 0;
+    assert(g_traceFile != NULL);
+    assert(trace->version == ZSTD_VERSION_NUMBER); /* CLI version must match. */
+    TRACE_log("compress", durationNanos, trace);
+}
+
+ZSTD_TraceCtx ZSTD_trace_decompress_begin(ZSTD_DCtx const* dctx)
+{
+    (void)dctx;
+    if (g_traceFile == NULL)
+        return 0;
+    return (ZSTD_TraceCtx)UTIL_clockSpanNano(g_enableTime);
+}
+
+void ZSTD_trace_decompress_end(ZSTD_TraceCtx ctx, ZSTD_Trace const* trace)
+{
+    PTime const beginNanos = (PTime)ctx;
+    PTime const endNanos = UTIL_clockSpanNano(g_enableTime);
+    PTime const durationNanos = endNanos > beginNanos ? endNanos - beginNanos : 0;
+    assert(g_traceFile != NULL);
+    assert(trace->version == ZSTD_VERSION_NUMBER); /* CLI version must match. */
+    TRACE_log("decompress", durationNanos, trace);
+}
+
+#else /* ZSTD_TRACE */
+
+void TRACE_enable(char const* filename)
+{
+    (void)filename;
+}
+
+void TRACE_finish(void) {}
+
+#endif /* ZSTD_TRACE */
diff --git a/programs/zstdcli_trace.h b/programs/zstdcli_trace.h
new file mode 100644
index 000000000000..38c27dc04c4f
--- /dev/null
+++ b/programs/zstdcli_trace.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTDCLI_TRACE_H
+#define ZSTDCLI_TRACE_H
+
+/**
+ * Enable tracing - log to filename.
+ */
+void TRACE_enable(char const* filename);
+
+/**
+ * Shut down the tracing library.
+ */
+void TRACE_finish(void);
+
+#endif /* ZSTDCLI_TRACE_H */
diff --git a/programs/zstdgrep.1 b/programs/zstdgrep.1
index 4d143b598735..563696d339ee 100644
--- a/programs/zstdgrep.1
+++ b/programs/zstdgrep.1
@@ -1,5 +1,5 @@
 .
-.TH "ZSTDGREP" "1" "December 2020" "zstd 1.4.8" "User Commands"
+.TH "ZSTDGREP" "1" "January 2022" "zstd 1.5.2" "User Commands"
 .
 .SH "NAME"
 \fBzstdgrep\fR \- print lines matching a pattern in zstandard\-compressed files
@@ -8,11 +8,14 @@
 \fBzstdgrep\fR [\fIgrep\-flags\fR] [\-\-] \fIpattern\fR [\fIfiles\fR \.\.\.]
 .
 .SH "DESCRIPTION"
-\fBzstdgrep\fR runs \fBgrep (1)\fR on files or stdin, if no files argument is given, after decompressing them with \fBzstdcat (1)\fR\.
+\fBzstdgrep\fR runs \fBgrep (1)\fR on files, or \fBstdin\fR if no files argument is given, after decompressing them with \fBzstdcat (1)\fR\.
 .
 .P
 The grep\-flags and pattern arguments are passed on to \fBgrep (1)\fR\. If an \fB\-e\fR flag is found in the \fBgrep\-flags\fR, \fBzstdgrep\fR will not look for a pattern argument\.
 .
+.P
+Note that modern \fBgrep\fR alternatives such as \fBripgrep\fR (\fBrg\fR) support \fBzstd\fR\-compressed files out of the box, and can prove better alternatives than \fBzstdgrep\fR notably for unsupported complex pattern searches\. Note though that such alternatives may also feature some minor command line differences\.
+.
 .SH "EXIT STATUS"
 In case of missing arguments or missing pattern, 1 will be returned, otherwise 0\.
 .
diff --git a/programs/zstdgrep.1.md b/programs/zstdgrep.1.md
index 363ad4f9978c..35186a4bf02d 100644
--- a/programs/zstdgrep.1.md
+++ b/programs/zstdgrep.1.md
@@ -9,10 +9,14 @@ SYNOPSIS
 
 DESCRIPTION
 -----------
-`zstdgrep` runs `grep (1)` on files or stdin, if no files argument is given, after decompressing them with `zstdcat (1)`.
+`zstdgrep` runs `grep (1)` on files, or `stdin` if no files argument is given, after decompressing them with `zstdcat (1)`.
 
 The grep-flags and pattern arguments are passed on to `grep (1)`.  If an `-e` flag is found in the `grep-flags`, `zstdgrep` will not look for a pattern argument.
 
+Note that modern `grep` alternatives such as `ripgrep` (`rg`) support `zstd`-compressed files out of the box,
+and can prove better alternatives than `zstdgrep` notably for unsupported complex pattern searches.
+Note though that such alternatives may also feature some minor command line differences.
+
 EXIT STATUS
 -----------
 In case of missing arguments or missing pattern, 1 will be returned, otherwise 0.
diff --git a/programs/zstdless.1 b/programs/zstdless.1
index 43f235453ee6..ab38e7a7f45d 100644
--- a/programs/zstdless.1
+++ b/programs/zstdless.1
@@ -1,5 +1,5 @@
 .
-.TH "ZSTDLESS" "1" "December 2020" "zstd 1.4.8" "User Commands"
+.TH "ZSTDLESS" "1" "January 2022" "zstd 1.5.2" "User Commands"
 .
 .SH "NAME"
 \fBzstdless\fR \- view zstandard\-compressed files