Fix first exercise on mutltiprocessing (empa-scientific-it#190)

baffelli · yakutovicha · web-flow · commit e1898d967d00 · 2023-12-15T13:00:14.000+01:00
* Fixed the solution of exercise 1.

* Removed empty cell.

* Moved all solution functions into reference solution.

* Fixed argument names.

* Added hint on how to handle failing exercise on Ipython.

* Fixed TOC.

* small fixes

---------

Co-authored-by: Aliaksandr Yakutovich &lt;yakutovicha@gmail.com&gt;
diff --git a/threads.ipynb b/threads.ipynb
@@ -15,7 +15,7 @@
     "  - [Parallelism and concurrency in Python](#Parallelism-and-concurrency-in-Python)\n",
     "    - [References](#References)\n",
     "    - [Introduction](#Introduction)\n",
-    "    - [Parallelism Vs. concurrency](#Parallelism-Vs.-concurrency)\n",
+    "    - [Parallelism vs. concurrency](#Parallelism-vs.-concurrency)\n",
     "      - [Parallelism](#Parallelism)\n",
     "      - [Concurrency](#Concurrency)\n",
     "      - [Quiz: parallel or not](#Quiz:-parallel-or-not)\n",
@@ -28,7 +28,7 @@
     "      - [Threads, GIL and the illusion of concurrency](#Threads,-GIL-and-the-illusion-of-concurrency)\n",
     "        - [Threads vs processes](#Threads-vs-processes)\n",
     "        - [When to use threads](#When-to-use-threads)\n",
-    "    - [Asynchronous programming and couroutines: cooperative multitasking](#Asynchronous-programming-and-couroutines:-cooperative-multitasking)\n",
+    "    - [Asynchronous programming and coroutines: cooperative multitasking](#Asynchronous-programming-and-coroutines:-cooperative-multitasking)\n",
     "    - [Exercises](#Exercises)\n",
     "      - [Exercise 1: Counting words in a file🌶️🌶️](#Exercise-1:-Counting-words-in-a-file🌶️🌶️)\n",
     "      - [Exercise 2: Find super secret server key🌶️🌶️🌶️](#Exercise-2:-Find-super-secret-server-key🌶️🌶️🌶️)"
@@ -958,15 +958,6 @@
     "## Exercises"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%reload_ext tutorial.tests.testsuite"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -976,7 +967,7 @@
     "### Exercise 1: Counting words in a file🌶️🌶️\n",
     "\n",
     "Write a **parallel** function `letter_statistics` that returns the statistics of letter counts in the large file `input_file`.\n",
-    "This means that the function should return a **sorted** `Dict[str, int]` containing the counts for each letter in sorted order.\n",
+    "This means that the function should return a  `dict[str, int]` containing the counts for each letter in sorted order.\n",
     "\n",
     "<div class=\"alert alert-block alert-info\">\n",
     "    <h4><b>Hints</b></h4>\n",
@@ -988,13 +979,19 @@
     "           To facilitate your work, we pass the size of the file (in number of characters) using the <code>size</code> argument.\n",
     "        </li>\n",
     "        <li>\n",
+    "            The input <code>n_processes</code> determines how many processes your solution should use.\n",
+    "        </li>\n",
+    "        <li>\n",
     "            Using <code>seek</code> you can specify a line offset from the start of the file. Using <code>read(size)</code> you can read <code>size</code> characters only. \n",
     "        </li>\n",
     "        <li>\n",
     "            Write your function in the cell below inside of the <code>solution_exercise1</code> function. The function receives a <code>Path</code> object <code>input_file</code> as an input and should return a single <code>dict[str, int]</code> dictionary.\n",
     "        </li>\n",
     "        <li>\n",
-    "        Consider using the <code>collections.Counter</code> class to count the number of letters in a string.\n",
+    "            Consider using the <code>collections.Counter</code> class to count the number of letters in a string.\n",
+    "        </li>\n",
+    "        <li>\n",
+    "            In case the test fails with a <code>BrokenProcessPool</code> error, consider moving the definition of your solution in a separate file, importing it in the notebook and calling it from <code>solution_exercise1</code>. \n",
     "        </li>\n",
     "    </ul>\n",
     "<div>\n"
@@ -1008,7 +1005,7 @@
    },
    "outputs": [],
    "source": [
-    "%reload_ext tutorial.tests.testsuite"
+    "%reload_ext tutorial.tests.testsuite\n"
    ]
   },
   {
@@ -1019,15 +1016,13 @@
    },
    "outputs": [],
    "source": [
-    "%%ipytest\n",
-    "from pathlib import Path\n",
-    "from collections import Counter\n",
-    "from concurrent.futures import ProcessPoolExecutor\n",
-    "from multiprocess import Process\n",
+    "%%ipytest \n",
+    "import pathlib\n",
     "\n",
-    "def solution_exercise1(input_file: Path, size: int) -> dict[str, int]:\n",
-    "   \"\"\"Write your solution here\"\"\"\n",
-    "   return {\"a\": 1}"
+    "def solution_exercise1(input_file: pathlib.Path, size: int, n_processes: int) -> dict[str, int]:\n",
+    "    \"\"\"Write your solution here\"\"\"\n",
+    "    return dict()\n",
+    "\n"
    ]
   },
   {
@@ -1115,7 +1110,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.11.1"
   }
  },
  "nbformat": 4,
diff --git a/tutorial/tests/test_threads.py b/tutorial/tests/test_threads.py
@@ -5,9 +5,8 @@
 import string
 from collections import Counter
 from concurrent.futures import ProcessPoolExecutor
-from typing import Awaitable, Callable, Dict
+from typing import Awaitable, Callable
 
-import multiprocess
 import pytest
 
 
@@ -56,67 +55,79 @@ def inner_file(size: int = 1000):
     return inner_file
 
 
-def read_segment(file: pathlib.Path, start: int, end: int) -> str:
-    with open(file) as f:
-        f.seek(start)
-        return f.read(end - start)
+def reference_exercise1(
+    input_file: pathlib.Path, size: int, n_processes: int
+) -> dict[str, int]:
+    def read_segment(file: pathlib.Path, start: int, end: int) -> str:
+        with open(file) as f:
+            f.seek(start)
+            return f.read(end - start)
 
+    def segment_stat(segment: str) -> dict[str, int]:
+        return Counter(segment.strip())
 
-def segment_stat(segment: str) -> Dict[str, int]:
-    return Counter(segment.strip())
+    def count_words(
+        file: pathlib.Path, size: int, n_processes: int, segment_index: int
+    ) -> dict[str, int]:
+        segment_size = size // n_processes
+        remainder = size % n_processes
+        start = segment_index * segment_size + min(segment_index, remainder)
+        end = start + segment_size + (1 if segment_index < remainder else 0)
+        return segment_stat(read_segment(file, start, end))
 
-
-def count_words(
-    file: pathlib.Path, size: int, n_processes: int, index: int
-) -> Dict[str, int]:
-    segment_size = size // n_processes
-    start = index * segment_size
-    end = start + segment_size
-    return segment_stat(read_segment(file, start, end))
-
-
-def reference_exercise1(input_path: pathlib.Path, size: int) -> Dict[str, int]:
-    workers = multiprocess.cpu_count()
-    with ProcessPoolExecutor(workers) as executor:
+    with ProcessPoolExecutor(n_processes) as executor:
         result = executor.map(
-            functools.partial(count_words, input_path, size, workers), range(workers)
+            functools.partial(count_words, input_file, size, n_processes),
+            range(n_processes),
         )
     return dict(functools.reduce(lambda x, y: x + y, result, Counter()))
 
 
-@pytest.mark.parametrize("size", [1000, 10000, 100000])
+random_file_sizes = [53, 123, 517, 1000, 10000]
+
+
+@pytest.mark.parametrize(
+    "size, n_processes", [(s, w) for s in random_file_sizes for w in [2, 4, 5, 7]]
+)
 def test_exercise1_total_counts(
     function_to_test: Callable,
     make_random_file: Callable[[None], pathlib.Path],
     size: int,
+    n_processes: int,
 ):
     rf = make_random_file(size)
-    reference_res = reference_exercise1(rf, size)
-    total_letters = sum(reference_res.values())
-    user_res = function_to_test(rf, size)
+    user_res = function_to_test(rf, size, n_processes)
     total_letters_user = sum(user_res.values())
-    assert total_letters == total_letters_user
+    assert total_letters_user == size
 
 
-@pytest.mark.parametrize("size", [1000, 10000, 100000])
+@pytest.mark.parametrize(
+    "size, workers", [(s, w) for s in random_file_sizes for w in [2, 4, 5, 7]]
+)
 def test_exercise1_counts(
     function_to_test: Callable,
     make_random_file: Callable[[None], pathlib.Path],
     size: int,
+    workers: int,
 ):
     rf = make_random_file(size)
-    reference_res = reference_exercise1(rf, size)
-    user_res = function_to_test(rf, size)
-    assert user_res == reference_res
+    # We read the file and use a counter as a trick. It is not parallel but we are
+    # sure it is correct
+    with open(rf) as f:
+        file_content = f.read()
+    # reference_res = count_words_parallel(rf, size, workers)
+    user_res = function_to_test(rf, size, workers)
+    assert user_res == Counter(file_content)
 
 
-# #TODO: find a way to test that the user is using multiprocessing (directly or indirectly)
+# TODO: find a way to test that the user is using multiprocessing (directly or indirectly)
 # def test_exercise1_processes(function_to_test: Callable, make_random_file: Callable[[None], pathlib.Path], monkeypatch: pytest.MonkeyPatch):
-#     with patch.object(multiprocessing.Process, "start") as process_mock:
-#         size = 1000
-#         rf = make_random_file(size)
-#         user_res = function_to_test(rf, size)
-#     assert process_mock.mock_calls or
+#     n_process_mock = MagicMock()
+#     n_process_mock.return_value = 2
+#     size = 1000
+#     rf = make_random_file(size)
+#     user_res = function_to_test(rf, size, n_process_mock)
+#     assert n_process_mock.called
 
 
 def find_word(letters: list[str], separator: str) -> bool: