Skip to content

Commit e1898d9

Browse files
Fix first exercise on mutltiprocessing (empa-scientific-it#190)
* Fixed the solution of exercise 1. * Removed empty cell. * Moved all solution functions into reference solution. * Fixed argument names. * Added hint on how to handle failing exercise on Ipython. * Fixed TOC. * small fixes --------- Co-authored-by: Aliaksandr Yakutovich <yakutovicha@gmail.com>
1 parent e7bdeec commit e1898d9

File tree

2 files changed

+66
-60
lines changed

2 files changed

+66
-60
lines changed

threads.ipynb

Lines changed: 18 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
" - [Parallelism and concurrency in Python](#Parallelism-and-concurrency-in-Python)\n",
1616
" - [References](#References)\n",
1717
" - [Introduction](#Introduction)\n",
18-
" - [Parallelism Vs. concurrency](#Parallelism-Vs.-concurrency)\n",
18+
" - [Parallelism vs. concurrency](#Parallelism-vs.-concurrency)\n",
1919
" - [Parallelism](#Parallelism)\n",
2020
" - [Concurrency](#Concurrency)\n",
2121
" - [Quiz: parallel or not](#Quiz:-parallel-or-not)\n",
@@ -28,7 +28,7 @@
2828
" - [Threads, GIL and the illusion of concurrency](#Threads,-GIL-and-the-illusion-of-concurrency)\n",
2929
" - [Threads vs processes](#Threads-vs-processes)\n",
3030
" - [When to use threads](#When-to-use-threads)\n",
31-
" - [Asynchronous programming and couroutines: cooperative multitasking](#Asynchronous-programming-and-couroutines:-cooperative-multitasking)\n",
31+
" - [Asynchronous programming and coroutines: cooperative multitasking](#Asynchronous-programming-and-coroutines:-cooperative-multitasking)\n",
3232
" - [Exercises](#Exercises)\n",
3333
" - [Exercise 1: Counting words in a file🌶️🌶️](#Exercise-1:-Counting-words-in-a-file🌶️🌶️)\n",
3434
" - [Exercise 2: Find super secret server key🌶️🌶️🌶️](#Exercise-2:-Find-super-secret-server-key🌶️🌶️🌶️)"
@@ -958,15 +958,6 @@
958958
"## Exercises"
959959
]
960960
},
961-
{
962-
"cell_type": "code",
963-
"execution_count": null,
964-
"metadata": {},
965-
"outputs": [],
966-
"source": [
967-
"%reload_ext tutorial.tests.testsuite"
968-
]
969-
},
970961
{
971962
"cell_type": "markdown",
972963
"metadata": {},
@@ -976,7 +967,7 @@
976967
"### Exercise 1: Counting words in a file🌶️🌶️\n",
977968
"\n",
978969
"Write a **parallel** function `letter_statistics` that returns the statistics of letter counts in the large file `input_file`.\n",
979-
"This means that the function should return a **sorted** `Dict[str, int]` containing the counts for each letter in sorted order.\n",
970+
"This means that the function should return a `dict[str, int]` containing the counts for each letter in sorted order.\n",
980971
"\n",
981972
"<div class=\"alert alert-block alert-info\">\n",
982973
" <h4><b>Hints</b></h4>\n",
@@ -988,13 +979,19 @@
988979
" To facilitate your work, we pass the size of the file (in number of characters) using the <code>size</code> argument.\n",
989980
" </li>\n",
990981
" <li>\n",
982+
" The input <code>n_processes</code> determines how many processes your solution should use.\n",
983+
" </li>\n",
984+
" <li>\n",
991985
" Using <code>seek</code> you can specify a line offset from the start of the file. Using <code>read(size)</code> you can read <code>size</code> characters only. \n",
992986
" </li>\n",
993987
" <li>\n",
994988
" Write your function in the cell below inside of the <code>solution_exercise1</code> function. The function receives a <code>Path</code> object <code>input_file</code> as an input and should return a single <code>dict[str, int]</code> dictionary.\n",
995989
" </li>\n",
996990
" <li>\n",
997-
" Consider using the <code>collections.Counter</code> class to count the number of letters in a string.\n",
991+
" Consider using the <code>collections.Counter</code> class to count the number of letters in a string.\n",
992+
" </li>\n",
993+
" <li>\n",
994+
" In case the test fails with a <code>BrokenProcessPool</code> error, consider moving the definition of your solution in a separate file, importing it in the notebook and calling it from <code>solution_exercise1</code>. \n",
998995
" </li>\n",
999996
" </ul>\n",
1000997
"<div>\n"
@@ -1008,7 +1005,7 @@
10081005
},
10091006
"outputs": [],
10101007
"source": [
1011-
"%reload_ext tutorial.tests.testsuite"
1008+
"%reload_ext tutorial.tests.testsuite\n"
10121009
]
10131010
},
10141011
{
@@ -1019,15 +1016,13 @@
10191016
},
10201017
"outputs": [],
10211018
"source": [
1022-
"%%ipytest\n",
1023-
"from pathlib import Path\n",
1024-
"from collections import Counter\n",
1025-
"from concurrent.futures import ProcessPoolExecutor\n",
1026-
"from multiprocess import Process\n",
1019+
"%%ipytest \n",
1020+
"import pathlib\n",
10271021
"\n",
1028-
"def solution_exercise1(input_file: Path, size: int) -> dict[str, int]:\n",
1029-
" \"\"\"Write your solution here\"\"\"\n",
1030-
" return {\"a\": 1}"
1022+
"def solution_exercise1(input_file: pathlib.Path, size: int, n_processes: int) -> dict[str, int]:\n",
1023+
" \"\"\"Write your solution here\"\"\"\n",
1024+
" return dict()\n",
1025+
"\n"
10311026
]
10321027
},
10331028
{
@@ -1115,7 +1110,7 @@
11151110
"name": "python",
11161111
"nbconvert_exporter": "python",
11171112
"pygments_lexer": "ipython3",
1118-
"version": "3.10.13"
1113+
"version": "3.11.1"
11191114
}
11201115
},
11211116
"nbformat": 4,

tutorial/tests/test_threads.py

Lines changed: 48 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,8 @@
55
import string
66
from collections import Counter
77
from concurrent.futures import ProcessPoolExecutor
8-
from typing import Awaitable, Callable, Dict
8+
from typing import Awaitable, Callable
99

10-
import multiprocess
1110
import pytest
1211

1312

@@ -56,67 +55,79 @@ def inner_file(size: int = 1000):
5655
return inner_file
5756

5857

59-
def read_segment(file: pathlib.Path, start: int, end: int) -> str:
60-
with open(file) as f:
61-
f.seek(start)
62-
return f.read(end - start)
58+
def reference_exercise1(
59+
input_file: pathlib.Path, size: int, n_processes: int
60+
) -> dict[str, int]:
61+
def read_segment(file: pathlib.Path, start: int, end: int) -> str:
62+
with open(file) as f:
63+
f.seek(start)
64+
return f.read(end - start)
6365

66+
def segment_stat(segment: str) -> dict[str, int]:
67+
return Counter(segment.strip())
6468

65-
def segment_stat(segment: str) -> Dict[str, int]:
66-
return Counter(segment.strip())
69+
def count_words(
70+
file: pathlib.Path, size: int, n_processes: int, segment_index: int
71+
) -> dict[str, int]:
72+
segment_size = size // n_processes
73+
remainder = size % n_processes
74+
start = segment_index * segment_size + min(segment_index, remainder)
75+
end = start + segment_size + (1 if segment_index < remainder else 0)
76+
return segment_stat(read_segment(file, start, end))
6777

68-
69-
def count_words(
70-
file: pathlib.Path, size: int, n_processes: int, index: int
71-
) -> Dict[str, int]:
72-
segment_size = size // n_processes
73-
start = index * segment_size
74-
end = start + segment_size
75-
return segment_stat(read_segment(file, start, end))
76-
77-
78-
def reference_exercise1(input_path: pathlib.Path, size: int) -> Dict[str, int]:
79-
workers = multiprocess.cpu_count()
80-
with ProcessPoolExecutor(workers) as executor:
78+
with ProcessPoolExecutor(n_processes) as executor:
8179
result = executor.map(
82-
functools.partial(count_words, input_path, size, workers), range(workers)
80+
functools.partial(count_words, input_file, size, n_processes),
81+
range(n_processes),
8382
)
8483
return dict(functools.reduce(lambda x, y: x + y, result, Counter()))
8584

8685

87-
@pytest.mark.parametrize("size", [1000, 10000, 100000])
86+
random_file_sizes = [53, 123, 517, 1000, 10000]
87+
88+
89+
@pytest.mark.parametrize(
90+
"size, n_processes", [(s, w) for s in random_file_sizes for w in [2, 4, 5, 7]]
91+
)
8892
def test_exercise1_total_counts(
8993
function_to_test: Callable,
9094
make_random_file: Callable[[None], pathlib.Path],
9195
size: int,
96+
n_processes: int,
9297
):
9398
rf = make_random_file(size)
94-
reference_res = reference_exercise1(rf, size)
95-
total_letters = sum(reference_res.values())
96-
user_res = function_to_test(rf, size)
99+
user_res = function_to_test(rf, size, n_processes)
97100
total_letters_user = sum(user_res.values())
98-
assert total_letters == total_letters_user
101+
assert total_letters_user == size
99102

100103

101-
@pytest.mark.parametrize("size", [1000, 10000, 100000])
104+
@pytest.mark.parametrize(
105+
"size, workers", [(s, w) for s in random_file_sizes for w in [2, 4, 5, 7]]
106+
)
102107
def test_exercise1_counts(
103108
function_to_test: Callable,
104109
make_random_file: Callable[[None], pathlib.Path],
105110
size: int,
111+
workers: int,
106112
):
107113
rf = make_random_file(size)
108-
reference_res = reference_exercise1(rf, size)
109-
user_res = function_to_test(rf, size)
110-
assert user_res == reference_res
114+
# We read the file and use a counter as a trick. It is not parallel but we are
115+
# sure it is correct
116+
with open(rf) as f:
117+
file_content = f.read()
118+
# reference_res = count_words_parallel(rf, size, workers)
119+
user_res = function_to_test(rf, size, workers)
120+
assert user_res == Counter(file_content)
111121

112122

113-
# #TODO: find a way to test that the user is using multiprocessing (directly or indirectly)
123+
# TODO: find a way to test that the user is using multiprocessing (directly or indirectly)
114124
# def test_exercise1_processes(function_to_test: Callable, make_random_file: Callable[[None], pathlib.Path], monkeypatch: pytest.MonkeyPatch):
115-
# with patch.object(multiprocessing.Process, "start") as process_mock:
116-
# size = 1000
117-
# rf = make_random_file(size)
118-
# user_res = function_to_test(rf, size)
119-
# assert process_mock.mock_calls or
125+
# n_process_mock = MagicMock()
126+
# n_process_mock.return_value = 2
127+
# size = 1000
128+
# rf = make_random_file(size)
129+
# user_res = function_to_test(rf, size, n_process_mock)
130+
# assert n_process_mock.called
120131

121132

122133
def find_word(letters: list[str], separator: str) -> bool:

0 commit comments

Comments
 (0)