Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support National Language Shift Tables #16

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 18 additions & 5 deletions lib/gsmsplitter.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
var gsmvalidator = require('./gsmvalidator');
var gsmValidator = require('./gsmvalidator');

/**
 * Tells whether a UTF-16 code unit lies in the high (leading) surrogate range.
 *
 * @param {number} code - Code unit, e.g. the result of `charCodeAt`.
 * @returns {boolean} True when `code` is between 0xD800 and 0xDBFF inclusive.
 */
function isHighSurrogate(code) {
  var HIGH_SURROGATE_FIRST = 0xD800;
  var HIGH_SURROGATE_LAST = 0xDBFF;

  return code >= HIGH_SURROGATE_FIRST && code <= HIGH_SURROGATE_LAST;
}

module.exports.split = function (message, options) {
options = options || { summary: false };
options = options || { supportShiftTables: false, summary: false };

if (message === '') {
return {
Expand All @@ -26,7 +26,6 @@ module.exports.split = function (message, options) {
var totalLength = 0;
var messagePart = '';


function bank() {
var msg = {
content: options.summary ? undefined : messagePart,
Expand All @@ -42,15 +41,29 @@ module.exports.split = function (message, options) {
messagePart = '';
}

/**
 * Validates one character, using the shift-table-aware validator when
 * `options.supportShiftTables` is truthy and the plain validator otherwise.
 *
 * @param {string} character - Character to check.
 * @returns {boolean} Result of the selected gsmValidator function.
 */
function validateCharacter(character) {
  return options.supportShiftTables
    ? gsmValidator.validateCharacterWithShiftTable(character)
    : gsmValidator.validateCharacter(character);
}

/**
 * Checks whether a character is an extended one, using the shift-table-aware
 * check when `options.supportShiftTables` is truthy and the plain check
 * otherwise.
 *
 * @param {string} character - Character to check.
 * @returns {boolean} Result of the selected gsmValidator function.
 */
function validateExtendedCharacter(character) {
  return options.supportShiftTables
    ? gsmValidator.validateExtendedCharacterWithShiftTable(character)
    : gsmValidator.validateExtendedCharacter(character);
}

for (var i = 0, count = message.length; i < count; i++) {
var c = message.charAt(i);

if (!gsmvalidator.validateCharacter(c)) {
if (!validateCharacter(c)) {
if (isHighSurrogate(c.charCodeAt(0))) {
i++;
}
c = '\u0020';
} else if (gsmvalidator.validateExtendedCharacter(c)) {
} else if (validateExtendedCharacter(c)) {
if (bytes === 152) bank();
bytes++;
}
Expand Down
222 changes: 164 additions & 58 deletions lib/gsmvalidator.js
Original file line number Diff line number Diff line change
@@ -1,58 +1,164 @@
// '@£$¥èéùìòÇ\nØø\rÅåΔ_ΦΓΛΩΠΨΣΘΞÆæßÉ\x20!"#¤%&\'()*+,-./0123456789:;<=>?¡ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÑܧ¿abcdefghijklmnopqrstuvwxyzäöñüà\f^{}\\[~]|€'
// Code units (values returned by String.prototype.charCodeAt) that
// validateCharacter accepts. Entries are kept in ascending order; note that
// 96 (backtick) is not in the list.
var GSM_charCodes = [
10,12,13,32,33,34,35,36,
37,38,39,40,41,42,43,44,
45,46,47,48,49,50,51,52,
53,54,55,56,57,58,59,60,
61,62,63,64,65,66,67,68,
69,70,71,72,73,74,75,76,
77,78,79,80,81,82,83,84,
85,86,87,88,89,90,91,92,
93,94,95,97,98,99,100,101,
102,103,104,105,106,107,108,
109,110,111,112,113,114,115,
116,117,118,119,120,121,122,
123,124,125,126,161,163,164,
165,167,191,196,197,198,199,
201,209,214,216,220,223,224,
228,229,230,232,233,236,241,
242,246,248,249,252,915,916,
920,923,926,928,931,934,936,
937,8364
];

// Code units that validateExtendedCharacter reports as extended characters.
// Every entry also appears in GSM_charCodes.
// '\f|^€{}[~]\\'
var GSMe_charCodes = [12,91,92,93,94,123,124,125,126,8364];

/**
 * Linear membership test using strict equality.
 *
 * @param {number} code - Value to look for.
 * @param {Array} array - List to scan.
 * @returns {boolean} True when some element is strictly equal to `code`.
 */
function existsInArray(code, array) {
  for (var index = 0, total = array.length; index < total; index += 1) {
    if (array[index] === code) {
      return true;
    }
  }
  return false;
}

/**
 * Checks whether the first code unit of `character` is listed in
 * GSM_charCodes.
 *
 * @param {string} character - Character to check; only index 0 is inspected.
 * @returns {boolean} True when the code unit is in GSM_charCodes.
 */
function validateCharacter(character) {
  return existsInArray(character.charCodeAt(0), GSM_charCodes);
}

/**
 * Checks that every character of `message` passes validateCharacter.
 *
 * @param {string} message - Text to check.
 * @returns {boolean} False at the first rejected character, true otherwise
 *   (including for an empty message).
 */
function validateMessage(message) {
  var position = 0;

  while (position < message.length) {
    var accepted = validateCharacter(message.charAt(position));
    if (!accepted) {
      return false;
    }
    position += 1;
  }

  return true;
}

/**
 * Checks whether the first code unit of `character` is listed in
 * GSMe_charCodes.
 *
 * @param {string} character - Character to check; only index 0 is inspected.
 * @returns {boolean} True when the code unit is in GSMe_charCodes.
 */
function validateExtendedCharacter(character) {
  return existsInArray(character.charCodeAt(0), GSMe_charCodes);
}

// Public API: per-character check, whole-message check, and the
// extended-character check.
module.exports.validateCharacter = validateCharacter;
module.exports.validateMessage = validateMessage;
module.exports.validateExtendedCharacter = validateExtendedCharacter;
// '@£$¥èéùìòÇ\nØø\rÅåΔ_ΦΓΛΩΠΨΣΘΞÆæßÉ\x20!"#¤%&\'()*+,-./0123456789:;<=>?¡ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÑܧ¿abcdefghijklmnopqrstuvwxyzäöñüà\f^{}\\[~]|€'
// Code units (values returned by String.prototype.charCodeAt) accepted by
// validateCharacter and validateMessage; also one of the lists consulted by
// the *WithShiftTable validators. Entries are kept in ascending order; note
// that 96 (backtick) is not in the list.
var GSM_charCodes = [
10,12,13,32,33,34,35,36,
37,38,39,40,41,42,43,44,
45,46,47,48,49,50,51,52,
53,54,55,56,57,58,59,60,
61,62,63,64,65,66,67,68,
69,70,71,72,73,74,75,76,
77,78,79,80,81,82,83,84,
85,86,87,88,89,90,91,92,
93,94,95,97,98,99,100,101,
102,103,104,105,106,107,108,
109,110,111,112,113,114,115,
116,117,118,119,120,121,122,
123,124,125,126,161,163,164,
165,167,191,196,197,198,199,
201,209,214,216,220,223,224,
228,229,230,232,233,236,241,
242,246,248,249,252,915,916,
920,923,926,928,931,934,936,
937,8364
];

// Code units that validateExtendedCharacter reports as extended characters;
// also part of the union checked by validateExtendedCharacterWithShiftTable.
// Every entry also appears in GSM_charCodes.
// '\f|^€{}[~]\\'
var GSMe_charCodes = [12,91,92,93,94,123,124,125,126,8364];

// '@£$¥€éùıòÇ\nĞğ\rÅåΔ_ΦΓΛΩΠΨΣΘΞŞşßÉ\x20!"#¤%&\'()*+,-./0123456789:;<=>?İABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÑܧçabcdefghijklmnopqrstuvwxyzäöñüà\f^{}\[~]|'
// Code units accepted for the "TR" table (presumably Turkish, judging by the
// characters listed — confirm against the spec). Consulted by
// validateCharacterWithShiftTable and validateMessageWithShiftTable.
// Entries are kept in ascending order.
var GSM_TR_charCodes = [
10,12,13,32,33,34,35,36,
37,38,39,40,41,42,43,44,
45,46,47,48,49,50,51,52,
53,54,55,56,57,58,59,60,
61,62,63,64,65,66,67,68,
69,70,71,72,73,74,75,76,
77,78,79,80,81,82,83,84,
85,86,87,88,89,90,91,92,
93,94,95,97,98,99,100,101,
102,103,104,105,106,107,108,
109,110,111,112,113,114,115,
116,117,118,119,120,121,122,
123,124,125,126,163,164,165,
167,196,197,199,201,209,214,
220,223,224,228,229,231,233,
241,242,246,249,252,286,287,
304,305,350,351,915,916,920,
923,926,928,931,934,936,937,
8364
];

// Extended code units for the "TR" table, part of the union checked by
// validateExtendedCharacterWithShiftTable.
// '\f^{}\\[~]|' plus 'ĞğİıŞş€' (286, 287, 304, 305, 350, 351, 8364)
var GSMe_TR_charCodes = [12,91,92,93,94,123,124,125,126,286,287,304,305,350,351,8364];

// '@£$¥èéùìòÇ\nØø\rÅåΔ_ΦΓΛΩΠΨΣΘΞÆæßÉ\x20!"#¤%&\'()*+,-./0123456789:;<=>?¡ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÑܧ¿abcdefghijklmnopqrstuvwxyzäöñüàç\f^{}\\[~]|ÁÍÓÚá€íóú'
// Code units accepted for the "ES" table (Spanish, per the review discussion
// of this change). Consulted by validateCharacterWithShiftTable and
// validateMessageWithShiftTable. Entries are kept in ascending order.
var GSM_ES_charCodes = [
10,12,13,32,33,34,35,36,
37,38,39,40,41,42,43,44,
45,46,47,48,49,50,51,52,
53,54,55,56,57,58,59,60,
61,62,63,64,65,66,67,68,
69,70,71,72,73,74,75,76,
77,78,79,80,81,82,83,84,
85,86,87,88,89,90,91,92,
93,94,95,97,98,99,100,101,
102,103,104,105,106,107,108,
109,110,111,112,113,114,115,
116,117,118,119,120,121,122,
123,124,125,126,161,163,164,
165,167,191,193,196,197,198,
199,201,205,209,211,214,216,
218,220,223,224,225,228,229,
230,231,232,233,236,237,241,
242,243,246,248,249,250,252,
915,916,920,923,926,928,931,
934,936,937,8364
];

// Extended code units for the "ES" table, part of the union checked by
// validateExtendedCharacterWithShiftTable.
// 'ç\f^{}\\[~]|ÁÍÓÚá€íóú'
var GSMe_ES_charCodes = [12,91,92,93,94,123,124,125,126,193,205,211,218,225,231,237,243,250,8364];

// '@£$¥êéúíóç\nÔô\rÁáΔ_ªÇÀ∞^\\€Ó|ÂâÊÉ\x20!"#º%&\'()*+,-./0123456789:;<=>?ÍABCDEFGHIJKLMNOPQRSTUVWXYZÃÕÚܧ~abcdefghijklmnopqrstuvwxyzãõ`üà\fΦΓ^ΩΠΨΣΘ{}\\[~]|'
// Code units accepted for the "PT" table (Portuguese, per the review
// discussion of this change). Consulted by validateCharacterWithShiftTable
// and validateMessageWithShiftTable. Entries are kept in ascending order.
// Unlike the other tables in this file, it includes 96 (backtick) and
// 8734 (infinity sign).
var GSM_PT_charCodes = [
10,12,13,32,33,34,35,36,
37,38,39,40,41,42,43,44,
45,46,47,48,49,50,51,52,
53,54,55,56,57,58,59,60,
61,62,63,64,65,66,67,68,
69,70,71,72,73,74,75,76,
77,78,79,80,81,82,83,84,
85,86,87,88,89,90,91,92,
93,94,95,96,97,98,99,100,
101,102,103,104,105,106,107,108,
109,110,111,112,113,114,115,116,
117,118,119,120,121,122,123,124,
125,126,163,165,167,170,186,192,
193,194,195,199,201,202,205,211,
212,213,218,220,224,225,226,227,
231,233,234,237,242,243,244,245,
250,252,915,916,920,928,931,934,
936,937,8364,8734
];

// Extended code units for the "PT" table, part of the union checked by
// validateExtendedCharacterWithShiftTable.
// '\f^{}\\[~]|' plus 'ÁÂÃÊÍÓÔÕÚáâãçêíòóõú', 'ΓΘΠΣΦΨΩ' and '€'
var GSMe_PT_charCodes = [
12,91,92,93,94,123,124,125,
126,193,194,195,202,205,211,212,
213,218,225,226,227,231,234,237,
242,243,245,250,915,920,928,931,
934,936,937,8364
];

/**
 * Linear membership test using strict equality.
 *
 * @param {number} code - Value to look for.
 * @param {Array} array - List to scan.
 * @returns {boolean} True when some element is strictly equal to `code`.
 */
function existsInArray(code, array) {
  for (var index = 0, total = array.length; index < total; index += 1) {
    if (array[index] === code) {
      return true;
    }
  }
  return false;
}

/**
 * Checks whether the first code unit of `character` is listed in
 * GSM_charCodes.
 *
 * @param {string} character - Character to check; only index 0 is inspected.
 * @returns {boolean} True when the code unit is in GSM_charCodes.
 */
function validateCharacter(character) {
  var codeUnit = character.charCodeAt(0);

  return existsInArray(codeUnit, GSM_charCodes);
}
function validateCharacterWithShiftTable(character) {
var charCodes = GSM_charCodes.concat(GSM_TR_charCodes, GSM_ES_charCodes, GSM_PT_charCodes);
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can see how this appears to make sense if all you need to do is count all possible valid characters that exist in any shift table, but this creates an invalid situation where the whole text doesn't fit any common shift table, and should be detected as Unicode. Take the following two messages:

  1. ∞ valid only in the Portuguese language (Latin script)
  2. Ø valid in either the Spanish language (Latin script) or the Basic Character Set

If each example was a full message, they would each be valid. However, if we take a message containing both characters:

∞Ø

As far as we know, this is an invalid message because there is no common shift table that supports both at the same time, so it should be detected as Unicode.

This library's character set auto-detection mechanism is pretty important, and we should think about how that can be preserved.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've updated the PR: this method is no longer used when validating a whole message. See the review comment below.

return existsInArray(character.charCodeAt(0), charCodes);
}

/**
 * Checks that every code unit of `message` is present in `charCodes`.
 *
 * @param {string} message - Text to check.
 * @param {number[]} charCodes - List of accepted code units.
 * @returns {boolean} False at the first code unit missing from `charCodes`,
 *   true otherwise (including for an empty message).
 */
function validateMessageInCharCodesList(message, charCodes) {
  var position = 0;

  while (position < message.length) {
    var known = existsInArray(message.charCodeAt(position), charCodes);
    if (!known) {
      return false;
    }
    position += 1;
  }

  return true;
}
/**
 * Checks that every code unit of `message` is listed in GSM_charCodes.
 *
 * @param {string} message - Text to check.
 * @returns {boolean} Result of validateMessageInCharCodesList for
 *   GSM_charCodes.
 */
function validateMessage(message) {
  var isValid = validateMessageInCharCodesList(message, GSM_charCodes);

  return isValid;
}
function validateMessageWithShiftTable(message) {
var charCodes = [GSM_charCodes, GSM_TR_charCodes, GSM_ES_charCodes, GSM_PT_charCodes];
for (var i = 0; i < charCodes.length; i++) {
if (validateMessageInCharCodesList(message, charCodes[i]))
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Validating the message against each shift table's character list independently solves the problem of characters from different shift tables being mixed within the same message.

return true;
}

return false;
}

/**
 * Checks whether the first code unit of `character` is listed in
 * GSMe_charCodes.
 *
 * @param {string} character - Character to check; only index 0 is inspected.
 * @returns {boolean} True when the code unit is in GSMe_charCodes.
 */
function validateExtendedCharacter(character) {
  var codeUnit = character.charCodeAt(0);

  return existsInArray(codeUnit, GSMe_charCodes);
}
/**
 * Checks whether the first code unit of `character` is listed in any of the
 * extended lists (GSMe_charCodes, GSMe_TR_charCodes, GSMe_ES_charCodes,
 * GSMe_PT_charCodes).
 *
 * The check is against the union of all lists; it does not take into account
 * which table the rest of the message belongs to.
 *
 * @param {string} character - Character to check; only index 0 is inspected.
 * @returns {boolean} True when the code unit is in at least one list.
 */
function validateExtendedCharacterWithShiftTable(character) {
  var code = character.charCodeAt(0);

  // Probe each list in turn instead of concatenating them: concat() builds a
  // brand-new array on every call, and this function runs once per character.
  return existsInArray(code, GSMe_charCodes) ||
    existsInArray(code, GSMe_TR_charCodes) ||
    existsInArray(code, GSMe_ES_charCodes) ||
    existsInArray(code, GSMe_PT_charCodes);
}

// Public API. Each validator is exported in two flavours: the plain one and
// a *WithShiftTable one that also consults the TR/ES/PT lists.
module.exports.validateCharacter = validateCharacter;
module.exports.validateCharacterWithShiftTable = validateCharacterWithShiftTable;
module.exports.validateMessage = validateMessage;
module.exports.validateMessageWithShiftTable = validateMessageWithShiftTable;
module.exports.validateExtendedCharacter = validateExtendedCharacter;
module.exports.validateExtendedCharacterWithShiftTable = validateExtendedCharacterWithShiftTable;
14 changes: 11 additions & 3 deletions lib/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,21 @@ var UNICODE = module.exports.UNICODE = 'Unicode';
var GSM = module.exports.GSM = 'GSM';

module.exports.split = function (message, options) {
var characterset = options && options.characterset;
var characterSet = options && options.characterset;

options = {
summary: options && options.summary
summary: options && options.summary,
supportShiftTables: options && options.supportShiftTables
};

var isGsm = (characterset === undefined && gsmValidator.validateMessage(message)) || characterset === GSM;
/**
 * Validates a whole message, using the shift-table-aware validator when
 * `options.supportShiftTables` is truthy and the plain validator otherwise.
 *
 * @param {string} message - Text to check.
 * @returns {boolean} Result of the selected gsmValidator function.
 */
function validateMessage(message) {
  return options.supportShiftTables
    ? gsmValidator.validateMessageWithShiftTable(message)
    : gsmValidator.validateMessage(message);
}

var isGsm = (characterSet === undefined && validateMessage(message)) || characterSet === GSM;
var splitResult, singleBytes, multiBytes, charBytes;

if (isGsm) {
Expand Down
Loading