[2/3] crit: Anonymize file paths in files.img

Submitted by Harshavardhan Unnibhavi on Sept. 20, 2019, 8:15 a.m.

Details

Message ID 20190920081522.9591-3-hvubfoss@gmail.com
State New
Series "Issue 360: Anonymize image files"
Headers show

Commit Message

Harshavardhan Unnibhavi Sept. 20, 2019, 8:15 a.m.
File path names are replaced by their corresponding SHA-1 hash values.
The top-level names, such as bin, var, usr, lib, etc., are kept unchanged.

Resolve Issue #360.

Signed-off-by: Harshavardhan Unnibhavi <hvubfoss@gmail.com>
---
 lib/py/anonymize.py | 75 +++++++++++++++++++++++++++++++++++++++++++++
 lib/py/cli.py       |  4 +++
 2 files changed, 79 insertions(+)
 create mode 100644 lib/py/anonymize.py

Patch hide | download patch | download mbox

diff --git a/lib/py/anonymize.py b/lib/py/anonymize.py
new file mode 100644
index 00000000..82b1ab97
--- /dev/null
+++ b/lib/py/anonymize.py
@@ -0,0 +1,75 @@ 
+# This file contains methods to anonymize criu images.
+
+# In order to anonymize images three steps are followed:
+#     - decode the binary image to json
+#     - strip the necessary information from the json dict
+#     - encode the json dict back to a binary image, which is now anonymized
+
+# The following contents are being anonymized:
+#     - Paths to files
+
+import hashlib
+
+
+def files_anon(image):  # anonymize 'reg' entry paths in place; returns image
+    levels = {}
+    # levels[depth][name] -> replacement string for that path component.
+    fname_key = 'reg'
+    checksum = hashlib.sha1()  # NOTE(review): shared, so digests accumulate
+    # Pass 1: build the replacement table from every entry with a 'reg' key.
+    for e in image['entries']:
+        if fname_key in e:
+            f_path = e[fname_key]['name']
+
+            f_path = f_path.split('/')
+            lev_num = 0
+            # Empty pieces from split('/') are skipped; lev_num counts the rest.
+            for i, p in enumerate(f_path):
+                if p == '':
+                    continue
+                if lev_num not in levels:
+                    levels[lev_num] = {}
+                if p not in levels[lev_num]:
+                    if i == 1:  # top-level name if path starts with '/'
+                        levels[lev_num][p] = p
+                    else:
+                        checksum.update(p)  # NOTE(review): py3 needs bytes here
+                        levels[lev_num][p] = checksum.hexdigest()
+                lev_num += 1
+    # Pass 2: rewrite each 'reg' name using the table built above.
+    for i, e in enumerate(image['entries']):
+        if fname_key in e:
+            f_path = e[fname_key]['name']
+
+            if f_path == '/':
+                continue
+
+            f_path = f_path.split('/')
+            lev_num = 0
+
+            for j, p in enumerate(f_path):
+                if p == '':
+                    continue
+                f_path[j] = levels[lev_num][p]
+                lev_num += 1
+            f_path = '/'.join(f_path)
+            image['entries'][i][fname_key]['name'] = f_path
+
+    return image
+
+
+anonymizers = {  # image magic string -> anonymizer function
+    'FILES': files_anon
+}
+
+
+def anon_handler(image):  # returns the anonymized image, or -1 if unsupported
+    magic = image['magic']
+    # Only 'FILES' images are handled; any other magic yields -1.
+    if magic != 'FILES':
+        return -1
+    # Dispatch through the anonymizers table keyed by magic.
+    handler = anonymizers[magic]
+    anon_img = handler(image)
+
+    return anon_img
diff --git a/lib/py/cli.py b/lib/py/cli.py
index 980d7fcc..ea0f4438 100755
--- a/lib/py/cli.py
+++ b/lib/py/cli.py
@@ -6,6 +6,7 @@  import os
 import glob
 
 import pycriu
+from anonymize import anon_handler
 
 
 def inf(opts):
@@ -348,6 +349,9 @@  def anonymize(opts):
 
         try:
             img = pycriu.images.load(inf(inf_opts))
+            anon_dict = anon_handler(img)
+            if anon_dict != -1:
+                pycriu.images.dump(anon_dict, outf(inf_opts))
         except pycriu.images.MagicException as exc:
             print("Unknown magic %#x.\n"
                 "Found a raw image" % exc.magic, file=sys.stderr)