In [2]:
from collections import Counter
def triple_counter(s):
    # Count every overlapping three-character substring of s
    # (+ 1 so the final triple is included), then print the repeats.
    c = Counter(s[n-3: n] for n in range(3, len(s) + 1))
    for tri, n in c.most_common():
        if n > 1:
            print('%s - %i times.' % (tri, n))
        else:
            break
if __name__ == '__main__':
    import random
    #s = ''.join(random.choice('0123456789') for _ in range(1_000_000))
    s = '123412345123456'
    triple_counter(s)
In [6]:
from collections import Counter
import random
from multiprocessing import Pool
import os
import math
def triples(s):
    # Count every overlapping three-character substring of s.
    return Counter(s[n-3: n] for n in range(3, len(s) + 1))

def triple_print(c):
    # Print only the triples that occur more than once, most common first.
    for tri, n in c.most_common():
        if n > 1:
            print(' %s - %i times.' % (tri, n))
        else:
            break
In [7]:
triple_print(triples('123412345123456'))
In [8]:
def calc_in_sections(s):
    cores = random.randint(1, min(32, len(s) // 3))  # use this many splits to simulate parallelism
    # Splits from 0..n+3, n..2n+3, 2n..3n+3, mn..(m+1)n+3 and (m+1)n+3 == len(s); m == cores
    n = math.ceil(len(s) / max(1, cores - 1))
    sections = [s[i * n: (i + 1) * n + 3] for i in range(cores)]
    sections = [ss for ss in sections if len(ss) >= 3]
    counts = map(triples, sections)
    # The last three characters of each non-final section reappear at the start
    # of the next section, so that one triple is counted twice: subtract it.
    double_counted = Counter(section[-3:] for section in sections[:-1])
    tot = sum(counts, Counter()) - double_counted
    return tot, cores
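As a check on the overlap correction, here is a small walk-through of the same splitting scheme with the number of sections pinned to three rather than chosen at random (the fixed count is only for illustration; it reuses math, Counter and triples from the cells above):

s = '123412345123456'
cores = 3
n = math.ceil(len(s) / (cores - 1))                       # n == 8
sections = [s[i * n: (i + 1) * n + 3] for i in range(cores)]
sections = [ss for ss in sections if len(ss) >= 3]        # ['12341234512', '5123456']
# The first section ends with '512' and the second begins with '512',
# so that triple would otherwise be counted in both sections.
overlap = Counter(ss[-3:] for ss in sections[:-1])        # Counter({'512': 1})
total = sum(map(triples, sections), Counter()) - overlap
assert total == triples(s)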
In [10]:
tot, _ = calc_in_sections('123412345123456')
triple_print(tot)
In [11]:
def calc_with_concurrency(s):
    cores = max(os.cpu_count() - 2, 1)  # leave a couple of cores free
    n = math.ceil(len(s) / max(1, cores - 1))
    sections = [s[i * n: (i + 1) * n + 3] for i in range(cores)]
    sections = [ss for ss in sections if len(ss) >= 3]
    with Pool(cores) as pool:
        counts = pool.map(triples, sections)  # count each section in a separate process
    double_counted = Counter(section[-3:] for section in sections[:-1])
    tot = sum(counts, Counter()) - double_counted
    return tot
In [ ]:
# triple_print(calc_with_concurrency('123412345123456')) ## See note on execution environment
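A likely reason the call above is left commented out: with the spawn start method (the default on Windows, and on macOS since Python 3.8), multiprocessing.Pool workers must import the function they run, and functions defined interactively in a notebook session may not be importable by the spawned processes. A minimal sketch of running it from an ordinary script instead; the file name trigram_count.py is only an assumed example:

# trigram_count.py  (assumed file name)
# Put the imports plus triples(), triple_print() and calc_with_concurrency()
# from the cells above at module level, then guard the driver code so that
# spawned workers do not re-execute it on import:
if __name__ == '__main__':
    triple_print(calc_with_concurrency('123412345123456'))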
In [ ]:
if __name__ == '__main__':
    s = '123412345123456'
    c = triples(s)
    assert c == Counter({'123': 3,
                         '234': 3,
                         '345': 2,
                         '341': 1,
                         '412': 1,
                         '451': 1,
                         '456': 1,
                         '512': 1})
    print(f'\n{s} ->')
    triple_print(c)

    for _ in range(10):
        digits = random.randint(10000, 50000)
        s = ''.join(random.choice('0123456789') for _ in range(digits))
        print(f'Checking a {digits}-digit string for triples calculated in three ways, including using concurrency.')
        # In one go
        c = triples(s)
        # In sections
        tot, cores = calc_in_sections(s)
        # In parallel
        ptot = calc_with_concurrency(s)
        # Unary + drops zero and negative counts before comparing.
        assert +c == +tot
        assert +c == +ptot
Your first solution is the same as MapReduce processing in Hadoop.
Okay. I'm not sure at what data size Hadoop, with its harnessing of multiple machines, becomes necessary, but that is another parallelization option, yes.