-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSplitFunctions.cs
102 lines (94 loc) · 3.62 KB
/
SplitFunctions.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
using System;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
namespace CountDistinctWords
{
/// <summary>
/// Alternative strategies for splitting a line of text into words.
/// Each method takes the same input and returns the words as an array, so the
/// strategies can be swapped and compared against each other.
/// </summary>
/// <remarks>
/// None of the strategies removes empty entries: adjacent separators
/// (for example a comma followed by a space) produce empty strings in the result.
/// </remarks>
public static class SplitFunctions
{
    // Separator sets and the regex are built once instead of on every call.
    // NOTE: the separator list is not exhaustive; it is only a sample set.
    private static readonly char[] BasicSeparators = { ' ', '.', ',', '!', '?' };
    private static readonly char[] BasicSeparatorsWithQuote = { ' ', '.', ',', '!', '?', '\"' };

    // \W+ matches one or more consecutive non-word characters.
    private static readonly Regex NonWordRun = new Regex(@"\W+", RegexOptions.Compiled);

    /// <summary>
    /// Splits on a fixed set of separator characters (space, '.', ',', '!', '?').
    /// When the separator characters are known in advance this can be the fastest approach.
    /// </summary>
    /// <param name="text">The line of text to split.</param>
    /// <returns>The pieces of <paramref name="text"/> between separators, empty entries included.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public static string[] SplitToWordsNormal(string text)
    {
        if (text == null)
        {
            throw new ArgumentNullException(nameof(text));
        }

        return text.Split(BasicSeparators);
    }

    /// <summary>
    /// Like <see cref="SplitToWordsNormal"/>, but removes duplicates from the result before returning.
    /// Note that this variant additionally treats the double quote as a separator.
    /// According to the original author's measurements this is slower than calling
    /// Distinct() once on the complete word list at the end.
    /// </summary>
    /// <param name="text">The line of text to split.</param>
    /// <returns>The distinct pieces of <paramref name="text"/>, in order of first occurrence.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public static string[] SplitToWordsNormalDistinct(string text)
    {
        if (text == null)
        {
            throw new ArgumentNullException(nameof(text));
        }

        return text.Split(BasicSeparatorsWithQuote).Distinct().ToArray();
    }

    /// <summary>
    /// Splits on every run of non-word characters using a regular expression.
    /// The original author notes that Regex is slow for splitting.
    /// </summary>
    /// <param name="text">The line of text to split.</param>
    /// <returns>
    /// The pieces of <paramref name="text"/> between runs of non-word characters.
    /// A leading or trailing run yields an empty string at that end of the array.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public static string[] SplitToWordsRegex(string text)
    {
        if (text == null)
        {
            throw new ArgumentNullException(nameof(text));
        }

        return NonWordRun.Split(text);
    }

    /// <summary>
    /// Splits on white space, then removes special characters from each word
    /// (see <c>RemoveSpecialCharacters</c>, which uses a StringBuilder).
    /// </summary>
    /// <param name="text">The line of text to split.</param>
    /// <returns>One cleaned entry per white-space separated piece of <paramref name="text"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public static string[] SplitToWordsReplace(string text)
    {
        if (text == null)
        {
            throw new ArgumentNullException(nameof(text));
        }

        // The cleaning has to happen per word, after splitting: cleaning the whole
        // line first would remove the spaces as well and leave nothing to split on.
        return text.Split().Select(word => word.RemoveSpecialCharacters()).ToArray();
    }

    /// <summary>
    /// Splits on white space, then trims from both ends of each word every punctuation
    /// character that occurs anywhere in the input. Punctuation inside a word is kept.
    /// The original author notes that this is the fastest variant in most cases.
    /// </summary>
    /// <param name="text">The line of text to split.</param>
    /// <returns>One trimmed entry per white-space separated piece of <paramref name="text"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public static string[] SplitWithLinq(string text)
    {
        if (text == null)
        {
            throw new ArgumentNullException(nameof(text));
        }

        // Collect the punctuation characters actually present in this text once,
        // then reuse that set as the trim set for every word.
        char[] punctuation = text.Where(c => char.IsPunctuation(c)).Distinct().ToArray();
        return text.Split().Select(word => word.Trim(punctuation)).ToArray();
    }

    /// <summary>
    /// Splits on white space and keeps only the letters of each word.
    /// The original author marks this variant as slow.
    /// </summary>
    /// <param name="text">The line of text to split.</param>
    /// <returns>One letters-only entry per white-space separated piece of <paramref name="text"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public static string[] SplitToWordsTest(string text)
    {
        if (text == null)
        {
            throw new ArgumentNullException(nameof(text));
        }

        return text.Split()
                   .Select(word => string.Concat(word.Where(c => char.IsLetter(c))))
                   .ToArray();
    }
}
/// <summary>
/// String extension helpers.
/// </summary>
public static class StringExtensions
{
    /// <summary>
    /// Returns a copy of <paramref name="str"/> that keeps only the characters
    /// 0-9, A-Z, a-z and the apostrophe; every other character is dropped.
    /// </summary>
    /// <remarks>
    /// The filter is ASCII-only, so letters outside A-Z / a-z (for example accented
    /// letters) are removed too.
    /// NOTE(review): the original left <c>char.IsLetter(c)</c> as a commented-out
    /// alternative; confirm that the ASCII-only behaviour is intended.
    /// </remarks>
    /// <param name="str">The string to filter.</param>
    /// <returns>The filtered string; empty when no character is kept.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
    public static string RemoveSpecialCharacters(this string str)
    {
        if (str == null)
        {
            throw new ArgumentNullException(nameof(str));
        }

        // The result can never be longer than the input, so size the buffer once.
        StringBuilder kept = new StringBuilder(str.Length);
        foreach (char c in str)
        {
            bool isDigit = c >= '0' && c <= '9';
            bool isUpper = c >= 'A' && c <= 'Z';
            bool isLower = c >= 'a' && c <= 'z';
            if (isDigit || isUpper || isLower || c == '\'')
            {
                kept.Append(c);
            }
        }

        return kept.ToString();
    }
}
}