[3/3] Issue #360: Anonymize image files

Submitted by Harshavardhan Unnibhavi on June 22, 2019, 9:37 a.m.

Details

Message ID 20190622093732.9024-3-hvubfoss@gmail.com
State New
Series "Series without cover letter"
Headers show

Commit Message

Harshavardhan Unnibhavi June 22, 2019, 9:37 a.m.
This commit adds the file anonymizer function which anonymizes file names present in images.

The anonymized file names are just the shuffled names along the path from root.

Signed-off-by: Harshavardhan Unnibhavi <hvubfoss@gmail.com>
---
 lib/py/cli.py   |  9 ++++++-
 lib/py/strip.py | 66 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+), 1 deletion(-)
 create mode 100644 lib/py/strip.py

Patch hide | download patch | download mbox

diff --git a/lib/py/cli.py b/lib/py/cli.py
index 17622fd2..4a8efeff 100755
--- a/lib/py/cli.py
+++ b/lib/py/cli.py
@@ -5,6 +5,7 @@  import json
 import os
 
 import pycriu
+import strip
 
 def inf(opts):
 	if opts['in']:
@@ -281,15 +282,21 @@  def anonymize(opts):
 	img_files = os.listdir(opts['in'])
 
 	for i in img_files:
-		temp = {'in':os.path.join(opts['in'], i)}
+		temp = {'in':os.path.join(opts['in'], i), 'out':os.path.join(opts['out'], i)}
 
 		try:
 			m, img = pycriu.images.load(inf(temp), anon_info = True)
+			print("Processing File name:{} with magic:{}".format(i, m))
 		except pycriu.images.MagicException as exc:
 			print("Unknown magic %#x.\n"\
 					"Found a raw image, continuing ..."% exc.magic, file=sys.stderr)
 			continue
 		
+		anon_dict = strip.anon_handler(img, m)
+		if anon_dict != -1:
+			pycriu.images.dump(anon_dict, outf(temp))
+		
+		
 
 explorers = { 'ps': explore_ps, 'fds': explore_fds, 'mems': explore_mems, 'rss': explore_rss }
 
diff --git a/lib/py/strip.py b/lib/py/strip.py
new file mode 100644
index 00000000..4069275c
--- /dev/null
+++ b/lib/py/strip.py
@@ -0,0 +1,66 @@ 
+# This file contains methods to deal with anonymising images.
+#
+# Contents being anonymised can be found at: https://github.com/checkpoint-restore/criu/issues/360
+#
+# Inorder to anonymise the image files three steps are followed:
+#    - decode the binary image to json
+#    - strip the necessary information from the json dict
+#    - encode the json dict back to a binary image, which is now anonymised
+
+import sys
+import json
+import random
+
+def files_anon(image):
+    levels = {}
+
+    for e in image['entries']:
+        f_path = e['reg']['name']
+        f_path = f_path.split('/')
+
+        lev_num = 0
+        for p in f_path:
+            if p == '':
+                continue
+            if lev_num in levels.keys():
+                if p not in levels[lev_num].keys():
+                    temp = list(p)
+                    random.shuffle(temp)
+                    levels[lev_num][p] = ''.join(temp)
+            else:
+                levels[lev_num] = {}
+                temp = list(p)
+                random.shuffle(temp)
+                levels[lev_num][p] = ''.join(temp)
+            lev_num += 1
+        
+    for i, e in enumerate(image['entries']):
+        f_path = e['reg']['name']
+        if f_path == '/':
+            continue
+        f_path = f_path.split('/')
+
+        lev_num = 0
+        for j, p in enumerate(f_path):
+            if p == '':
+                continue
+            f_path[j] = levels[lev_num][p]
+            lev_num += 1
+        f_path = '/'.join(f_path)
+        image['entries'][i]['reg']['name'] = f_path
+    
+    return image
+        
+
+
+
+anonymizers = {
+    'FILES': files_anon,
+    }
+
+def anon_handler(image, magic):
+    if magic != 'FILES':
+        return -1
+    handler = anonymizers[magic]
+    anon_image = handler(image)
+    return anon_image
\ No newline at end of file

Comments

Pavel Emelianov June 24, 2019, noon
On 6/22/19 12:37 PM, Harshavardhan Unnibhavi wrote:
> This commit adds the file anonymizer function which anonymizes file names present in images.
> 
> The anonymized file names are just the shuffled names along the path from root.
> 
> Signed-off-by: Harshavardhan Unnibhavi <hvubfoss@gmail.com>
> ---
>  lib/py/cli.py   |  9 ++++++-
>  lib/py/strip.py | 66 +++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 74 insertions(+), 1 deletion(-)
>  create mode 100644 lib/py/strip.py
> 
> diff --git a/lib/py/cli.py b/lib/py/cli.py
> index 17622fd2..4a8efeff 100755
> --- a/lib/py/cli.py
> +++ b/lib/py/cli.py
> @@ -5,6 +5,7 @@ import json
>  import os
>  
>  import pycriu
> +import strip
>  
>  def inf(opts):
>  	if opts['in']:
> @@ -281,15 +282,21 @@ def anonymize(opts):
>  	img_files = os.listdir(opts['in'])
>  
>  	for i in img_files:
> -		temp = {'in':os.path.join(opts['in'], i)}
> +		temp = {'in':os.path.join(opts['in'], i), 'out':os.path.join(opts['out'], i)}
>  
>  		try:
>  			m, img = pycriu.images.load(inf(temp), anon_info = True)
> +			print("Processing File name:{} with magic:{}".format(i, m))
>  		except pycriu.images.MagicException as exc:
>  			print("Unknown magic %#x.\n"\
>  					"Found a raw image, continuing ..."% exc.magic, file=sys.stderr)
>  			continue
>  		
> +		anon_dict = strip.anon_handler(img, m)
> +		if anon_dict != -1:
> +			pycriu.images.dump(anon_dict, outf(temp))

A message about skipping the file is needed.

> +		
> +		
>  
>  explorers = { 'ps': explore_ps, 'fds': explore_fds, 'mems': explore_mems, 'rss': explore_rss }
>  
> diff --git a/lib/py/strip.py b/lib/py/strip.py
> new file mode 100644
> index 00000000..4069275c
> --- /dev/null
> +++ b/lib/py/strip.py
> @@ -0,0 +1,66 @@
> +# This file contains methods to deal with anonymising images.
> +#
> +# Contents being anonymised can be found at: https://github.com/checkpoint-restore/criu/issues/360
> +#
> +# Inorder to anonymise the image files three steps are followed:
> +#    - decode the binary image to json
> +#    - strip the necessary information from the json dict
> +#    - encode the json dict back to a binary image, which is now anonymised
> +
> +import sys
> +import json
> +import random
> +
> +def files_anon(image):
> +    levels = {}
> +
> +    for e in image['entries']:
> +        f_path = e['reg']['name']
> +        f_path = f_path.split('/')
> +
> +        lev_num = 0
> +        for p in f_path:
> +            if p == '':
> +                continue
> +            if lev_num in levels.keys():
> +                if p not in levels[lev_num].keys():
> +                    temp = list(p)
> +                    random.shuffle(temp)

Erm, I'm not 100% it's OK to anonymize file paths like that.

> +                    levels[lev_num][p] = ''.join(temp)
> +            else:
> +                levels[lev_num] = {}
> +                temp = list(p)
> +                random.shuffle(temp)
> +                levels[lev_num][p] = ''.join(temp)

Can we factor out these two branches a bit? Smth like

            if lev_num not in levels.keys():
                levels[lev_num] = {}
            if p not in levels[lev_num].keys():
                temp = list(p)
                random.shuffle(temp)
                levels[lev_num][p] = ''.join(temp)

> +            lev_num += 1
> +        
> +    for i, e in enumerate(image['entries']):
> +        f_path = e['reg']['name']
> +        if f_path == '/':
> +            continue
> +        f_path = f_path.split('/')
> +
> +        lev_num = 0
> +        for j, p in enumerate(f_path):
> +            if p == '':
> +                continue
> +            f_path[j] = levels[lev_num][p]
> +            lev_num += 1
> +        f_path = '/'.join(f_path)
> +        image['entries'][i]['reg']['name'] = f_path
> +    
> +    return image
> +        
> +
> +
> +
> +anonymizers = {
> +    'FILES': files_anon,
> +    }
> +

Please, run lint tool on this file, I'm afraid this coding style is not correct. The config
file for lint is in scripts/flake8.cfg.

> +def anon_handler(image, magic):
> +    if magic != 'FILES':
> +        return -1
> +    handler = anonymizers[magic]
> +    anon_image = handler(image)
> +    return anon_image
> \ No newline at end of file
>
Radostin Stoyanov June 24, 2019, 3:15 p.m.
On 24/06/2019 13:00, Pavel Emelianov wrote:
> On 6/22/19 12:37 PM, Harshavardhan Unnibhavi wrote:
>> This commit adds the file anonymizer function which anonymizes file names present in images.
>>
>> The anonymized file names are just the shuffled names along the path from root.
>>
>> Signed-off-by: Harshavardhan Unnibhavi <hvubfoss@gmail.com>
>> ---
>>   lib/py/cli.py   |  9 ++++++-
>>   lib/py/strip.py | 66 +++++++++++++++++++++++++++++++++++++++++++++++++
>>   2 files changed, 74 insertions(+), 1 deletion(-)
>>   create mode 100644 lib/py/strip.py
>>
>> diff --git a/lib/py/cli.py b/lib/py/cli.py
>> index 17622fd2..4a8efeff 100755
>> --- a/lib/py/cli.py
>> +++ b/lib/py/cli.py
>> @@ -5,6 +5,7 @@ import json
>>   import os
>>   
>>   import pycriu
>> +import strip
the keyword 'strip' is very popular (e.g. Python has string.strip() 
method) and using it as a module name may cause confusion, perhaps using 
'anonymize' would be better?
>>   
>>   def inf(opts):
>>   	if opts['in']:
>> @@ -281,15 +282,21 @@ def anonymize(opts):
>>   	img_files = os.listdir(opts['in'])
>>   
>>   	for i in img_files:
>> -		temp = {'in':os.path.join(opts['in'], i)}
>> +		temp = {'in':os.path.join(opts['in'], i), 'out':os.path.join(opts['out'], i)}
>>   
>>   		try:
>>   			m, img = pycriu.images.load(inf(temp), anon_info = True)
>> +			print("Processing File name:{} with magic:{}".format(i, m))
>>   		except pycriu.images.MagicException as exc:
>>   			print("Unknown magic %#x.\n"\
>>   					"Found a raw image, continuing ..."% exc.magic, file=sys.stderr)
>>   			continue
>>   		
>> +		anon_dict = strip.anon_handler(img, m)
>> +		if anon_dict != -1:
>> +			pycriu.images.dump(anon_dict, outf(temp))
> A message about skipping the file is needed.
>
>> +		
>> +		
>>   
>>   explorers = { 'ps': explore_ps, 'fds': explore_fds, 'mems': explore_mems, 'rss': explore_rss }
>>   
>> diff --git a/lib/py/strip.py b/lib/py/strip.py
>> new file mode 100644
>> index 00000000..4069275c
>> --- /dev/null
>> +++ b/lib/py/strip.py
The indentation of this file is using spaces, (IMHO we should be using 
space for python code) however, the rest of the code base is using tabs. 
For consistency it might be better to use tabs in this file as well?
>> @@ -0,0 +1,66 @@
>> +# This file contains methods to deal with anonymising images.
>> +#
>> +# Contents being anonymised can be found at: https://github.com/checkpoint-restore/criu/issues/360
Could you please add the content that is being anonymised instead of 
providing an external link to the github issue? This will be helpful 
when reading the source code offline.
>> +#
>> +# Inorder to anonymise the image files three steps are followed:
s/Inorder/In order/g
>> +#    - decode the binary image to json
>> +#    - strip the necessary information from the json dict
>> +#    - encode the json dict back to a binary image, which is now anonymised
>> +
>> +import sys
>> +import json
>> +import random
>> +
>> +def files_anon(image):
>> +    levels = {}
>> +
>> +    for e in image['entries']:
>> +        f_path = e['reg']['name']
we should handle KeyError: 'reg' or check if the reg key exists.
>> +        f_path = f_path.split('/')
>> +
>> +        lev_num = 0
>> +        for p in f_path:
>> +            if p == '':
>> +                continue
>> +            if lev_num in levels.keys():
>> +                if p not in levels[lev_num].keys():
is .keys() necessary here?
>> +                    temp = list(p)
>> +                    random.shuffle(temp)
> Erm, I'm not 100% it's OK to anonymize file paths like that.
Computing a hash could be another option?
>
>> +                    levels[lev_num][p] = ''.join(temp)
>> +            else:
>> +                levels[lev_num] = {}
>> +                temp = list(p)
>> +                random.shuffle(temp)
>> +                levels[lev_num][p] = ''.join(temp)
> Can we factor out these two branches a bit? Smth like
>
>              if lev_num not in levels.keys():
>                  levels[lev_num] = {}
>              if p not in levels[lev_num].keys():
>                  temp = list(p)
>                  random.shuffle(temp)
>                  levels[lev_num][p] = ''.join(temp)
>
>> +            lev_num += 1
>> +
>> +    for i, e in enumerate(image['entries']):
>> +        f_path = e['reg']['name']
>> +        if f_path == '/':
>> +            continue
>> +        f_path = f_path.split('/')
>> +
>> +        lev_num = 0
>> +        for j, p in enumerate(f_path):
>> +            if p == '':
>> +                continue
>> +            f_path[j] = levels[lev_num][p]
>> +            lev_num += 1
>> +        f_path = '/'.join(f_path)
>> +        image['entries'][i]['reg']['name'] = f_path
>> +
>> +    return image
>> +
>> +
>> +
>> +
>> +anonymizers = {
>> +    'FILES': files_anon,
>> +    }
>> +
> Please, run lint tool on this file, I'm afraid this coding style is not correct. The config
> file for lint is in scripts/flake8.cfg.

To run the lint tool you will need to install flake8:
$ pip install flake8
$ make lint

>> +def anon_handler(image, magic):
>> +    if magic != 'FILES':
>> +        return -1
>> +    handler = anonymizers[magic]
>> +    anon_image = handler(image)
>> +    return anon_image
please add an empty line before the return statement?
>> \ No newline at end of file
please add a new line at end of file?
Pavel Emelianov June 25, 2019, 9:52 a.m.
>>> diff --git a/lib/py/strip.py b/lib/py/strip.py
>>> new file mode 100644
>>> index 00000000..4069275c
>>> --- /dev/null
>>> +++ b/lib/py/strip.py
> The indentation of this file is using spaces, (IMHO we should be using 
> space for python code) however, the rest of the code base is using tabs. 
> For consistency it might be better to use tabs in this file as well?

I've been told a lot that true pythonic indentation is with spaces, not tabs.
That said -- should we take sed and re-format the whole py stuff into spaces?

>>> @@ -0,0 +1,66 @@
>>> +# This file contains methods to deal with anonymising images.
>>> +#
>>> +# Contents being anonymised can be found at: https://github.com/checkpoint-restore/criu/issues/360
> Could you please add the content that is being anonymised instead of 
> providing an external link to the github issue? This will be helpful 
> when reading the source code offline.
>>> +#
>>> +# Inorder to anonymise the image files three steps are followed:
> s/Inorder/In order/g
>>> +#    - decode the binary image to json
>>> +#    - strip the necessary information from the json dict
>>> +#    - encode the json dict back to a binary image, which is now anonymised
>>> +
>>> +import sys
>>> +import json
>>> +import random
>>> +
>>> +def files_anon(image):
>>> +    levels = {}
>>> +
>>> +    for e in image['entries']:
>>> +        f_path = e['reg']['name']
> we should handle KeyError: 'reg' or check if the reg key exists.
>>> +        f_path = f_path.split('/')
>>> +
>>> +        lev_num = 0
>>> +        for p in f_path:
>>> +            if p == '':
>>> +                continue
>>> +            if lev_num in levels.keys():
>>> +                if p not in levels[lev_num].keys():
> is .keys() necessary here?
>>> +                    temp = list(p)
>>> +                    random.shuffle(temp)
>> Erm, I'm not 100% it's OK to anonymize file paths like that.
> Computing a hash could be another option?

Yes.

I was also thinking on checking the 1st level to be one of "known" names like
var, home, usr, etc, etc ( ;) ) and not shuffling them.

-- Pavel
Radostin Stoyanov June 25, 2019, 10:33 a.m.
On 25/06/2019 10:52, Pavel Emelianov wrote:
>>>> diff --git a/lib/py/strip.py b/lib/py/strip.py
>>>> new file mode 100644
>>>> index 00000000..4069275c
>>>> --- /dev/null
>>>> +++ b/lib/py/strip.py
>> The indentation of this file is using spaces, (IMHO we should be using
>> space for python code) however, the rest of the code base is using tabs.
>> For consistency it might be better to use tabs in this file as well?
> I've been told a lot that true pythonic indentation is with spaces, not tabs.
> That said -- should we take sed and re-format the whole py stuff into spaces?
https://www.python.org/dev/peps/pep-0008/#tabs-or-spaces

I think we should. Then we can drop W191 and E101 from the ignore list 
in scripts/flake8.cfg

Radostin
Pavel Emelianov June 25, 2019, 1:19 p.m.
On 6/25/19 2:49 PM, Harshavardhan Unnibhavi wrote:
> On Mon, Jun 24, 2019 at 5:30 PM Pavel Emelianov <xemul@virtuozzo.com> wrote:

When responding on patch comments, please, keep the criu@openvz.org in Cc.
My answers are inline.

>>> +             anon_dict = strip.anon_handler(img, m)
>>> +             if anon_dict != -1:
>>> +                     pycriu.images.dump(anon_dict, outf(temp))
>>
>> A message about skipping the file is needed.
> What extra message should I include? I just borrowed the print
> statement string from the decode function.

Something that prints on the screen that a particular image file has been skipped
from anonymization.

>>> +def files_anon(image):
>>> +    levels = {}
>>> +
>>> +    for e in image['entries']:
>>> +        f_path = e['reg']['name']
>>> +        f_path = f_path.split('/')
>>> +
>>> +        lev_num = 0
>>> +        for p in f_path:
>>> +            if p == '':
>>> +                continue
>>> +            if lev_num in levels.keys():
>>> +                if p not in levels[lev_num].keys():
>>> +                    temp = list(p)
>>> +                    random.shuffle(temp)
>>
>> Erm, I'm not 100% it's OK to anonymize file paths like that.
> Yeah, so basically it transforms /bin/lib to something like
> "nbi/bli/". How would you want me to change it?

Radostin suggested to calculate some checksum on the path component and I agree with him.

>>> +                    levels[lev_num][p] = ''.join(temp)
>>> +            else:
>>> +                levels[lev_num] = {}
>>> +                temp = list(p)
>>> +                random.shuffle(temp)
>>> +                levels[lev_num][p] = ''.join(temp)
>>
>> Can we factor out these two branches a bit? Smth like
>>
>>             if lev_num not in levels.keys():
>>                 levels[lev_num] = {}
>>             if p not in levels[lev_num].keys():
>>                 temp = list(p)
>>                 random.shuffle(temp)
>>                 levels[lev_num][p] = ''.join(temp)
>>
> Sure I will change it.

Thanks!

-- Pavel
Harshavardhan Unnibhavi June 25, 2019, 1:38 p.m.
On Mon, Jun 24, 2019 at 8:45 PM Radostin Stoyanov <rstoyanov1@gmail.com> wrote:
>
> On 24/06/2019 13:00, Pavel Emelianov wrote:
> > On 6/22/19 12:37 PM, Harshavardhan Unnibhavi wrote:
> >> This commit adds the file anonymizer function which anonymizes file names present in images.
> >>
> >> The anonymized file names are just the shuffled names along the path from root.
> >>
> >> Signed-off-by: Harshavardhan Unnibhavi <hvubfoss@gmail.com>
> >> ---
> >>   lib/py/cli.py   |  9 ++++++-
> >>   lib/py/strip.py | 66 +++++++++++++++++++++++++++++++++++++++++++++++++
> >>   2 files changed, 74 insertions(+), 1 deletion(-)
> >>   create mode 100644 lib/py/strip.py
> >>
> >> diff --git a/lib/py/cli.py b/lib/py/cli.py
> >> index 17622fd2..4a8efeff 100755
> >> --- a/lib/py/cli.py
> >> +++ b/lib/py/cli.py
> >> @@ -5,6 +5,7 @@ import json
> >>   import os
> >>
> >>   import pycriu
> >> +import strip
> the keyword 'strip' is very popular (e.g. Python has string.strip()
> method) and using it as a module name may cause confusion, perhaps using
> 'anonymize' would be better?
Yes anonymize would be better name. I will change it.
> >>
> >>   def inf(opts):
> >>      if opts['in']:
> >> @@ -281,15 +282,21 @@ def anonymize(opts):
> >>      img_files = os.listdir(opts['in'])
> >>
> >>      for i in img_files:
> >> -            temp = {'in':os.path.join(opts['in'], i)}
> >> +            temp = {'in':os.path.join(opts['in'], i), 'out':os.path.join(opts['out'], i)}
> >>
> >>              try:
> >>                      m, img = pycriu.images.load(inf(temp), anon_info = True)
> >> +                    print("Processing File name:{} with magic:{}".format(i, m))
> >>              except pycriu.images.MagicException as exc:
> >>                      print("Unknown magic %#x.\n"\
> >>                                      "Found a raw image, continuing ..."% exc.magic, file=sys.stderr)
> >>                      continue
> >>
> >> +            anon_dict = strip.anon_handler(img, m)
> >> +            if anon_dict != -1:
> >> +                    pycriu.images.dump(anon_dict, outf(temp))
> > A message about skipping the file is needed.
> >
> >> +
> >> +
> >>
> >>   explorers = { 'ps': explore_ps, 'fds': explore_fds, 'mems': explore_mems, 'rss': explore_rss }
> >>
> >> diff --git a/lib/py/strip.py b/lib/py/strip.py
> >> new file mode 100644
> >> index 00000000..4069275c
> >> --- /dev/null
> >> +++ b/lib/py/strip.py
> The indentation of this file is using spaces, (IMHO we should be using
> space for python code) however, the rest of the code base is using tabs.
> For consistency it might be better to use tabs in this file as well?
> >> @@ -0,0 +1,66 @@
> >> +# This file contains methods to deal with anonymising images.
> >> +#
> >> +# Contents being anonymised can be found at: https://github.com/checkpoint-restore/criu/issues/360
> Could you please add the content that is being anonymised instead of
> providing an external link to the github issue? This will be helpful
> when reading the source code offline.
Sure.

> >> +#
> >> +# Inorder to anonymise the image files three steps are followed:
> s/Inorder/In order/g
> >> +#    - decode the binary image to json
> >> +#    - strip the necessary information from the json dict
> >> +#    - encode the json dict back to a binary image, which is now anonymised
> >> +
> >> +import sys
> >> +import json
> >> +import random
> >> +
> >> +def files_anon(image):
> >> +    levels = {}
> >> +
> >> +    for e in image['entries']:
> >> +        f_path = e['reg']['name']
> we should handle KeyError: 'reg' or check if the reg key exists.
> >> +        f_path = f_path.split('/')
> >> +
> >> +        lev_num = 0
> >> +        for p in f_path:
> >> +            if p == '':
> >> +                continue
> >> +            if lev_num in levels.keys():
> >> +                if p not in levels[lev_num].keys():
> is .keys() necessary here?
> >> +                    temp = list(p)
> >> +                    random.shuffle(temp)
> > Erm, I'm not 100% it's OK to anonymize file paths like that.
> Computing a hash could be another option?
Yes I will look into it.
> >
> >> +                    levels[lev_num][p] = ''.join(temp)
> >> +            else:
> >> +                levels[lev_num] = {}
> >> +                temp = list(p)
> >> +                random.shuffle(temp)
> >> +                levels[lev_num][p] = ''.join(temp)
> > Can we factor out these two branches a bit? Smth like
> >
> >              if lev_num not in levels.keys():
> >                  levels[lev_num] = {}
> >              if p not in levels[lev_num].keys():
> >                  temp = list(p)
> >                  random.shuffle(temp)
> >                  levels[lev_num][p] = ''.join(temp)
> >
> >> +            lev_num += 1
> >> +
> >> +    for i, e in enumerate(image['entries']):
> >> +        f_path = e['reg']['name']
> >> +        if f_path == '/':
> >> +            continue
> >> +        f_path = f_path.split('/')
> >> +
> >> +        lev_num = 0
> >> +        for j, p in enumerate(f_path):
> >> +            if p == '':
> >> +                continue
> >> +            f_path[j] = levels[lev_num][p]
> >> +            lev_num += 1
> >> +        f_path = '/'.join(f_path)
> >> +        image['entries'][i]['reg']['name'] = f_path
> >> +
> >> +    return image
> >> +
> >> +
> >> +
> >> +
> >> +anonymizers = {
> >> +    'FILES': files_anon,
> >> +    }
> >> +
> > Please, run lint tool on this file, I'm afraid this coding style is not correct. The config
> > file for lint is in scripts/flake8.cfg.
>
> To run the lint tool you will need to install flake8:
> $ pip install flake8
> $ make lint
>
> >> +def anon_handler(image, magic):
> >> +    if magic != 'FILES':
> >> +        return -1
> >> +    handler = anonymizers[magic]
> >> +    anon_image = handler(image)
> >> +    return anon_image
> please add an empty line before the return statement?
Sure.
> >> \ No newline at end of file
> please add a new line at end of file?
Harshavardhan Unnibhavi June 25, 2019, 1:39 p.m.
On Tue, Jun 25, 2019 at 3:22 PM Pavel Emelianov <xemul@virtuozzo.com> wrote:
>
> >>> diff --git a/lib/py/strip.py b/lib/py/strip.py
> >>> new file mode 100644
> >>> index 00000000..4069275c
> >>> --- /dev/null
> >>> +++ b/lib/py/strip.py
> > The indentation of this file is using spaces, (IMHO we should be using
> > space for python code) however, the rest of the code base is using tabs.
> > For consistency it might be better to use tabs in this file as well?
>
> I've been told a lot that true pythonic indentation is with spaces, not tabs.
> That said -- should we take sed and re-format the whole py stuff into spaces?
>
> >>> @@ -0,0 +1,66 @@
> >>> +# This file contains methods to deal with anonymising images.
> >>> +#
> >>> +# Contents being anonymised can be found at: https://github.com/checkpoint-restore/criu/issues/360
> > Could you please add the content that is being anonymised instead of
> > providing an external link to the github issue? This will be helpful
> > when reading the source code offline.
> >>> +#
> >>> +# Inorder to anonymise the image files three steps are followed:
> > s/Inorder/In order/g
> >>> +#    - decode the binary image to json
> >>> +#    - strip the necessary information from the json dict
> >>> +#    - encode the json dict back to a binary image, which is now anonymised
> >>> +
> >>> +import sys
> >>> +import json
> >>> +import random
> >>> +
> >>> +def files_anon(image):
> >>> +    levels = {}
> >>> +
> >>> +    for e in image['entries']:
> >>> +        f_path = e['reg']['name']
> > we should handle KeyError: 'reg' or check if the reg key exists.
> >>> +        f_path = f_path.split('/')
> >>> +
> >>> +        lev_num = 0
> >>> +        for p in f_path:
> >>> +            if p == '':
> >>> +                continue
> >>> +            if lev_num in levels.keys():
> >>> +                if p not in levels[lev_num].keys():
> > is .keys() necessary here?
> >>> +                    temp = list(p)
> >>> +                    random.shuffle(temp)
> >> Erm, I'm not 100% it's OK to anonymize file paths like that.
> > Computing a hash could be another option?
>
> Yes.
>
> I was also thinking on checking the 1st level to be one of "known" names like
> var, home, usr, etc, etc ( ;) ) and not shuffling them.

Sure I can make that change.
>
> -- Pavel

Harsha