Skip to content

Allow clean to keep some metadata keys #672

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 23 additions & 11 deletions nbdev/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,29 +52,37 @@ def _clean_cell_output(cell):
o.get('metadata', {}).pop('tags', None)

# %% ../nbs/11_clean.ipynb 8
def _clean_cell(cell, clear_all=False):
def _clean_cell(cell, clear_all=False, allowed_metadata_keys=None):
"Clean `cell` by removing superfluous metadata or everything except the input if `clear_all`"
if 'execution_count' in cell: cell['execution_count'] = None
if 'outputs' in cell:
if clear_all: cell['outputs'] = []
else: _clean_cell_output(cell)
if cell['source'] == ['']: cell['source'] = []
cell['metadata'] = {} if clear_all else {
k:v for k,v in cell['metadata'].items() if k=="hide_input"}
k:v for k,v in cell['metadata'].items() if k in allowed_metadata_keys}

# %% ../nbs/11_clean.ipynb 9
def clean_nb(
    nb, # The notebook to clean
    clear_all=False, # Remove all cell metadata and cell outputs
    allowed_metadata_keys:list=None, # Preserve the list of keys in the main notebook metadata
    allowed_cell_metadata_keys:list=None # Preserve the list of keys in cell level metadata
):
    "Clean `nb` from superfluous metadata"
    # Built-in keys are always kept; caller-supplied keys extend them.
    keep_in_nb = {"kernelspec", "jekyll", "jupytext", "doc"}
    keep_in_cell = {"hide_input"}
    if allowed_metadata_keys: keep_in_nb |= set(allowed_metadata_keys)
    if allowed_cell_metadata_keys: keep_in_cell |= set(allowed_cell_metadata_keys)
    # Cells are cleaned in place first, then the notebook-level metadata is filtered.
    for cell in nb['cells']:
        _clean_cell(cell, clear_all=clear_all, allowed_metadata_keys=keep_in_cell)
    nb['metadata'] = {name:val for name,val in nb['metadata'].items() if name in keep_in_nb}

# %% ../nbs/11_clean.ipynb 12
# %% ../nbs/11_clean.ipynb 19
def _reconfigure(*strms):
for s in strms:
if hasattr(s,'reconfigure'): s.reconfigure(encoding='utf-8')

# %% ../nbs/11_clean.ipynb 13
# %% ../nbs/11_clean.ipynb 20
def process_write(warn_msg, proc_nb, f_in, f_out=None, disp=False):
if not f_out: f_out = sys.stdout if disp else f_in
if isinstance(f_in, (str,Path)): f_in = Path(f_in).open()
Expand All @@ -87,7 +95,7 @@ def process_write(warn_msg, proc_nb, f_in, f_out=None, disp=False):
warn(f'{warn_msg}')
warn(e)

# %% ../nbs/11_clean.ipynb 14
# %% ../nbs/11_clean.ipynb 21
@call_parse
def nbdev_clean(
fname:str=None, # A notebook name or glob to clean
Expand All @@ -97,14 +105,18 @@ def nbdev_clean(
):
"Clean all notebooks in `fname` to avoid merge conflicts"
# Git hooks will pass the notebooks in stdin
_clean = partial(clean_nb, clear_all=clear_all)
allowed_metadata_keys = config_key("allowed_metadata_keys", default='', missing_ok=True, path=False).split()
allowed_cell_metadata_keys = config_key("allowed_cell_metadata_keys", default='', missing_ok=True, path=False).split()
_clean = partial(clean_nb, clear_all=clear_all,
allowed_metadata_keys=allowed_metadata_keys,
allowed_cell_metadata_keys=allowed_cell_metadata_keys)
_write = partial(process_write, warn_msg='Failed to clean notebook', proc_nb=_clean)
if stdin: return _write(f_in=sys.stdin, f_out=sys.stdout)

if fname is None: fname = config_key("nbs_path", '.', missing_ok=True)
for f in globtastic(fname, file_glob='*.ipynb', skip_folder_re='^[_.]'): _write(f_in=f, disp=disp)

# %% ../nbs/11_clean.ipynb 16
# %% ../nbs/11_clean.ipynb 23
@call_parse
def nbdev_install_hooks():
"Install git hooks to clean and trust notebooks automatically"
Expand Down
123 changes: 98 additions & 25 deletions nbs/11_clean.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -121,15 +121,15 @@
"outputs": [],
"source": [
"#|export\n",
"def _clean_cell(cell, clear_all=False):\n",
"def _clean_cell(cell, clear_all=False, allowed_metadata_keys=None):\n",
" \"Clean `cell` by removing superfluous metadata or everything except the input if `clear_all`\"\n",
" if 'execution_count' in cell: cell['execution_count'] = None\n",
" if 'outputs' in cell:\n",
" if clear_all: cell['outputs'] = []\n",
" else: _clean_cell_output(cell)\n",
" if cell['source'] == ['']: cell['source'] = []\n",
" cell['metadata'] = {} if clear_all else {\n",
" k:v for k,v in cell['metadata'].items() if k==\"hide_input\"}"
" k:v for k,v in cell['metadata'].items() if k in allowed_metadata_keys}"
]
},
{
Expand All @@ -139,11 +139,26 @@
"outputs": [],
"source": [
"#|export\n",
"def clean_nb(nb, clear_all=False):\n",
"def clean_nb(\n",
" nb, # The notebook to clean\n",
" clear_all=False, # Remove all cell metadata and cell outputs\n",
" allowed_metadata_keys:list=None, # Preserve the list of keys in the main notebook metadata\n",
" allowed_cell_metadata_keys:list=None # Preserve the list of keys in cell level metadata\n",
"):\n",
" \"Clean `nb` from superfluous metadata\"\n",
" for c in nb['cells']: _clean_cell(c, clear_all=clear_all)\n",
" nb['metadata'] = {k:v for k,v in nb['metadata'].items() if k in\n",
" (\"kernelspec\", \"jekyll\", \"jupytext\", \"doc\")}"
" metadata_keys = {\"kernelspec\", \"jekyll\", \"jupytext\", \"doc\"}\n",
" if allowed_metadata_keys: metadata_keys.update(allowed_metadata_keys)\n",
" cell_metadata_keys = {\"hide_input\"}\n",
" if allowed_cell_metadata_keys: cell_metadata_keys.update(allowed_cell_metadata_keys)\n",
" for c in nb['cells']: _clean_cell(c, clear_all=clear_all, allowed_metadata_keys=cell_metadata_keys)\n",
" nb['metadata'] = {k:v for k,v in nb['metadata'].items() if k in metadata_keys}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The test notebook has metadata in both the main metadata section and contains cell level metadata in the second cell:"
]
},
{
Expand All @@ -152,24 +167,69 @@
"metadata": {},
"outputs": [],
"source": [
"tst = {'cell_type': 'code', 'execution_count': 26,\n",
" 'metadata': {'hide_input': True, 'meta': 23},\n",
" 'outputs': [{'execution_count': 2,\n",
" 'data': {\n",
" 'application/vnd.google.colaboratory.intrinsic+json': {'type': 'string'},\n",
" 'plain/text': ['sample output',]\n",
" }, 'output': 'super'}],\n",
" 'source': 'awesome_code'}\n",
"nb = {'metadata': {'kernelspec': 'some_spec', 'jekyll': 'some_meta', 'meta': 37}, 'cells': [tst]}\n",
"test_nb = read_nb('../tests/metadata.ipynb')\n",
"\n",
"clean_nb(nb)\n",
"test_eq(nb['cells'][0], {'cell_type': 'code', 'execution_count': None,\n",
" 'metadata': {'hide_input': True},\n",
" 'outputs': [{'execution_count': None, \n",
" 'data': { 'plain/text': ['sample output',]},\n",
" 'output': 'super'}],\n",
" 'source': 'awesome_code'})\n",
"test_eq(nb['metadata'], {'kernelspec': 'some_spec', 'jekyll': 'some_meta'})"
"assert set(['meta', 'jekyll', 'my_extra_key', 'my_removed_key']) <= set(test_nb.metadata.keys())\n",
"assert set(['meta', 'hide_input', 'my_extra_cell_key', 'my_removed_cell_key']) == set(test_nb.cells[1].metadata.keys())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"After cleaning the notebook, all extra metadata is removed, only some keys are allowed by default:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"clean_nb(test_nb)\n",
"\n",
"assert set(['jekyll', 'kernelspec']) == set(test_nb.metadata.keys())\n",
"assert set(['hide_input']) == set(test_nb.cells[1].metadata.keys())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can preserve some additional keys at the notebook or cell levels:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test_nb = read_nb('../tests/metadata.ipynb')\n",
"clean_nb(test_nb, allowed_metadata_keys={'my_extra_key'}, allowed_cell_metadata_keys={'my_extra_cell_key'})\n",
"\n",
"assert set(['jekyll', 'kernelspec', 'my_extra_key']) == set(test_nb.metadata.keys())\n",
"assert set(['hide_input', 'my_extra_cell_key']) == set(test_nb.cells[1].metadata.keys())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Passing the `clear_all=True` keyword removes everything from the cell metadata:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test_nb = read_nb('../tests/metadata.ipynb')\n",
"clean_nb(test_nb, clear_all=True)\n",
"\n",
"assert set(['jekyll', 'kernelspec']) == set(test_nb.metadata.keys())\n",
"test_eq(test_nb.cells[1].metadata, {})"
]
},
{
Expand Down Expand Up @@ -227,7 +287,11 @@
"):\n",
" \"Clean all notebooks in `fname` to avoid merge conflicts\"\n",
" # Git hooks will pass the notebooks in stdin\n",
" _clean = partial(clean_nb, clear_all=clear_all)\n",
" allowed_metadata_keys = config_key(\"allowed_metadata_keys\", default='', missing_ok=True, path=False).split()\n",
" allowed_cell_metadata_keys = config_key(\"allowed_cell_metadata_keys\", default='', missing_ok=True, path=False).split()\n",
" _clean = partial(clean_nb, clear_all=clear_all,\n",
" allowed_metadata_keys=allowed_metadata_keys,\n",
" allowed_cell_metadata_keys=allowed_cell_metadata_keys)\n",
" _write = partial(process_write, warn_msg='Failed to clean notebook', proc_nb=_clean)\n",
" if stdin: return _write(f_in=sys.stdin, f_out=sys.stdout)\n",
" \n",
Expand All @@ -239,7 +303,16 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"By default (`fname` left to `None`), the all the notebooks in `lib_folder` are cleaned. You can opt in to fully clean the notebook by removing every bit of metadata and the cell outputs by passing `clear_all=True`."
"By default (`fname` left to `None`), the all the notebooks in `lib_folder` are cleaned. You can opt in to fully clean the notebook by removing every bit of metadata and the cell outputs by passing `clear_all=True`.\n",
"\n",
"If you want to keep some keys in the main notebook metadata you can set `allowed_metadata_keys` in `settings.ini`.\n",
"Similarly for cell level metadata use: `allowed_cell_metadata_keys`. For example, to preserve both `k1` and `k2` at both the notebook and cell level adding the following in `settings.ini`:\n",
"```\n",
"...\n",
"allowed_metadata_keys = k1 k2\n",
"allowed_cell_metadata_keys = k1 k2\n",
"...\n",
"```"
]
},
{
Expand Down
62 changes: 62 additions & 0 deletions tests/metadata.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## A notebook with metadata"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"hide_input": true,
"my_extra_cell_key": "foo",
"my_removed_cell_key": "foo",
"meta": 37
},
"outputs": [
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# A cell with metadata\n",
"1+1"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.13"
},
"my_extra_key": "foo",
"my_removed_key": "foo",
"jekyll": "some_meta",
"meta": 37
},
"nbformat": 4,
"nbformat_minor": 4
}