-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathutils.py
67 lines (57 loc) · 2.64 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
__author__ = 'dowling'
import itertools
import numpy
# These are some utility functions, taken from Gensim by Radim Rekurek. They're included here for portability only.
# Copyright (C) 2010 Radim Rehurek <[email protected]>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
def chunkize_serial(iterable, chunksize, as_numpy=False):
"""
Return elements from the iterable in `chunksize`-ed lists. The last returned
element may be smaller (if length of collection is not divisible by `chunksize`).
>>> print(list(grouper(range(10), 3)))
[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
"""
it = iter(iterable)
while True:
if as_numpy:
# convert each document to a 2d numpy array (~6x faster when transmitting
# chunk data over the wire, in Pyro)
wrapped_chunk = [[numpy.array(doc) for doc in itertools.islice(it, int(chunksize))]]
else:
wrapped_chunk = [list(itertools.islice(it, int(chunksize)))]
if not wrapped_chunk[0]:
break
# memory opt: wrap the chunk and then pop(), to avoid leaving behind a dangling reference
yield wrapped_chunk.pop()
grouper = chunkize_serial
def is_corpus(obj):
"""
Check whether `obj` is a corpus. Return (is_corpus, new) 2-tuple, where
`new is obj` if `obj` was an iterable, or `new` yields the same sequence as
`obj` if it was an iterator.
`obj` is a corpus if it supports iteration over documents, where a document
is in turn anything that acts as a sequence of 2-tuples (int, float).
Note: An "empty" corpus (empty input sequence) is ambiguous, so in this case the
result is forcefully defined as `is_corpus=False`.
"""
try:
if 'Corpus' in obj.__class__.__name__: # the most common case, quick hack
return True, obj
except:
pass
try:
if hasattr(obj, 'next'):
# the input is an iterator object, meaning once we call next()
# that element could be gone forever. we must be careful to put
# whatever we retrieve back again
doc1 = next(obj)
obj = itertools.chain([doc1], obj)
else:
doc1 = next(iter(obj)) # empty corpus is resolved to False here
if len(doc1) == 0: # sparse documents must have a __len__ function (list, tuple...)
return True, obj # the first document is empty=>assume this is a corpus
id1, val1 = next(iter(doc1)) # if obj is a numpy array, it resolves to False here
id1, val1 = int(id1), float(val1) # must be a 2-tuple (integer, float)
except:
return False, obj
return True, obj