Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support National Language Shift Tables #16

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 18 additions & 5 deletions lib/gsmsplitter.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
var gsmvalidator = require('./gsmvalidator');
var gsmValidator = require('./gsmvalidator');

/**
 * Checks whether a UTF-16 code unit lies in the range 0xD800..0xDBFF
 * (both bounds inclusive).
 *
 * @param {number} code - Code unit to classify.
 * @returns {boolean} true when the code unit is inside the range.
 */
function isHighSurrogate(code) {
    if (code < 0xD800) {
        return false;
    }
    return code <= 0xDBFF;
}

module.exports.split = function (message, options) {
options = options || { summary: false };
options = options || { supportShiftTables: false, summary: false };

if (message === '') {
return {
Expand All @@ -26,7 +26,6 @@ module.exports.split = function (message, options) {
var totalLength = 0;
var messagePart = '';


function bank() {
var msg = {
content: options.summary ? undefined : messagePart,
Expand All @@ -42,15 +41,29 @@ module.exports.split = function (message, options) {
messagePart = '';
}

// Dispatches to the shift-table-aware validator when options.supportShiftTables
// is truthy, and to the plain validator otherwise.
function validateCharacter(character) {
    return options.supportShiftTables
        ? gsmValidator.validateCharacterWithShiftTable(character)
        : gsmValidator.validateCharacter(character);
}

// Same dispatch as for regular characters, but for the extended-character
// check: the plain validator is used unless options.supportShiftTables is set.
function validateExtendedCharacter(character) {
    if (!options.supportShiftTables) {
        return gsmValidator.validateExtendedCharacter(character);
    }
    return gsmValidator.validateExtendedCharacterWithShiftTable(character);
}

for (var i = 0, count = message.length; i < count; i++) {
var c = message.charAt(i);

if (!gsmvalidator.validateCharacter(c)) {
if (!validateCharacter(c)) {
if (isHighSurrogate(c.charCodeAt(0))) {
i++;
}
c = '\u0020';
} else if (gsmvalidator.validateExtendedCharacter(c)) {
} else if (validateExtendedCharacter(c)) {
if (bytes === 152) bank();
bytes++;
}
Expand Down
216 changes: 158 additions & 58 deletions lib/gsmvalidator.js
Original file line number Diff line number Diff line change
@@ -1,58 +1,158 @@
// '@£$¥èéùìòÇ\nØø\rÅåΔ_ΦΓΛΩΠΨΣΘΞÆæßÉ\x20!"#¤%&\'()*+,-./0123456789:;<=>?¡ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÑܧ¿abcdefghijklmnopqrstuvwxyzäöñüà\f^{}\\[~]|€'
var GSM_charCodes = [
10,12,13,32,33,34,35,36,
37,38,39,40,41,42,43,44,
45,46,47,48,49,50,51,52,
53,54,55,56,57,58,59,60,
61,62,63,64,65,66,67,68,
69,70,71,72,73,74,75,76,
77,78,79,80,81,82,83,84,
85,86,87,88,89,90,91,92,
93,94,95,97,98,99,100,101,
102,103,104,105,106,107,108,
109,110,111,112,113,114,115,
116,117,118,119,120,121,122,
123,124,125,126,161,163,164,
165,167,191,196,197,198,199,
201,209,214,216,220,223,224,
228,229,230,232,233,236,241,
242,246,248,249,252,915,916,
920,923,926,928,931,934,936,
937,8364
];

// '\f|^€{}[~]\\'
var GSMe_charCodes = [12,91,92,93,94,123,124,125,126,8364];

/**
 * Linear search for a value in an array using strict equality.
 *
 * @param {number} code - Value to look for.
 * @param {Array} array - Array to scan from index 0 upwards.
 * @returns {boolean} true as soon as a strictly equal element is found.
 */
function existsInArray(code, array) {
    var found = false;
    for (var idx = 0, total = array.length; !found && idx < total; idx++) {
        found = array[idx] === code;
    }
    return found;
}

/**
 * Tests the first UTF-16 code unit of `character` against GSM_charCodes.
 *
 * @param {string} character - String whose first code unit is checked.
 * @returns {boolean} Result of the lookup in GSM_charCodes.
 */
function validateCharacter(character) {
    return existsInArray(character.charCodeAt(0), GSM_charCodes);
}

/**
 * Checks every character of a message with validateCharacter, stopping at
 * the first one that is rejected.
 *
 * @param {string} message - Text to check.
 * @returns {boolean} false if any character is rejected, true otherwise
 *     (including for an empty message).
 */
function validateMessage(message) {
    var allValid = true;
    for (var pos = 0; allValid && pos < message.length; pos++) {
        allValid = !!validateCharacter(message.charAt(pos));
    }
    return allValid;
}

/**
 * Tests the first UTF-16 code unit of `character` against GSMe_charCodes.
 *
 * @param {string} character - String whose first code unit is checked.
 * @returns {boolean} Result of the lookup in GSMe_charCodes.
 */
function validateExtendedCharacter(character) {
    return existsInArray(character.charCodeAt(0), GSMe_charCodes);
}

// Public API: per-character check, whole-message check, and the check
// against the GSMe_charCodes (extension) list.
module.exports.validateCharacter = validateCharacter;
module.exports.validateMessage = validateMessage;
module.exports.validateExtendedCharacter = validateExtendedCharacter;
// '@£$¥èéùìòÇ\nØø\rÅåΔ_ΦΓΛΩΠΨΣΘΞÆæßÉ\x20!"#¤%&\'()*+,-./0123456789:;<=>?¡ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÑܧ¿abcdefghijklmnopqrstuvwxyzäöñüà\f^{}\\[~]|€'
var GSM_charCodes = [
10,12,13,32,33,34,35,36,
37,38,39,40,41,42,43,44,
45,46,47,48,49,50,51,52,
53,54,55,56,57,58,59,60,
61,62,63,64,65,66,67,68,
69,70,71,72,73,74,75,76,
77,78,79,80,81,82,83,84,
85,86,87,88,89,90,91,92,
93,94,95,97,98,99,100,101,
102,103,104,105,106,107,108,
109,110,111,112,113,114,115,
116,117,118,119,120,121,122,
123,124,125,126,161,163,164,
165,167,191,196,197,198,199,
201,209,214,216,220,223,224,
228,229,230,232,233,236,241,
242,246,248,249,252,915,916,
920,923,926,928,931,934,936,
937,8364
];

// '\f|^€{}[~]\\'
var GSMe_charCodes = [12,91,92,93,94,123,124,125,126,8364];

// '@£$¥€éùıòÇ\nĞğ\rÅåΔ_ΦΓΛΩΠΨΣΘΞŞşßÉ\x20!"#¤%&\'()*+,-./0123456789:;<=>?İABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÑܧçabcdefghijklmnopqrstuvwxyzäöñüà\f^{}\[~]|'
var GSM_TR_charCodes = [
10,12,13,32,33,34,35,36,
37,38,39,40,41,42,43,44,
45,46,47,48,49,50,51,52,
53,54,55,56,57,58,59,60,
61,62,63,64,65,66,67,68,
69,70,71,72,73,74,75,76,
77,78,79,80,81,82,83,84,
85,86,87,88,89,90,91,92,
93,94,95,97,98,99,100,101,
102,103,104,105,106,107,108,
109,110,111,112,113,114,115,
116,117,118,119,120,121,122,
123,124,125,126,163,164,165,
167,196,197,199,201,209,214,
220,223,224,228,229,231,233,
241,242,246,249,252,286,287,
304,305,350,351,915,916,920,
923,926,928,931,934,936,937,
8364
];

// '\f^{}\\[~]|ĞğİıŞş€'
var GSMe_TR_charCodes = [12,91,92,93,94,123,124,125,126,286,287,304,305,350,351,8364];

// '@£$¥èéùìòÇ\nØø\rÅåΔ_ΦΓΛΩΠΨΣΘΞÆæßÉ\x20!"#¤%&\'()*+,-./0123456789:;<=>?¡ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÑܧ¿abcdefghijklmnopqrstuvwxyzäöñüàç\f^{}\\[~]|ÁÍÓÚá€íóú'
var GSM_ES_charCodes = [
10,12,13,32,33,34,35,36,
37,38,39,40,41,42,43,44,
45,46,47,48,49,50,51,52,
53,54,55,56,57,58,59,60,
61,62,63,64,65,66,67,68,
69,70,71,72,73,74,75,76,
77,78,79,80,81,82,83,84,
85,86,87,88,89,90,91,92,
93,94,95,97,98,99,100,101,
102,103,104,105,106,107,108,
109,110,111,112,113,114,115,
116,117,118,119,120,121,122,
123,124,125,126,161,163,164,
165,167,191,193,196,197,198,
199,201,205,209,211,214,216,
218,220,223,224,225,228,229,
230,231,232,233,236,237,241,
242,243,246,248,249,250,252,
915,916,920,923,926,928,931,
934,936,937,8364
];

// 'ç\f^{}\\[~]|ÁÍÓÚá€íóú'
var GSMe_ES_charCodes = [12,91,92,93,94,123,124,125,126,193,205,211,218,225,231,237,243,250,8364];

// '@£$¥êéúíóç\nÔô\rÁáΔ_ªÇÀ∞^\\€Ó|ÂâÊÉ\x20!"#º%&\'()*+,-./0123456789:;<=>?ÍABCDEFGHIJKLMNOPQRSTUVWXYZÃÕÚܧ~abcdefghijklmnopqrstuvwxyzãõ`üà\fΦΓ^ΩΠΨΣΘ{}\\[~]|'
var GSM_PT_charCodes = [
10,12,13,32,33,34,35,36,
37,38,39,40,41,42,43,44,
45,46,47,48,49,50,51,52,
53,54,55,56,57,58,59,60,
61,62,63,64,65,66,67,68,
69,70,71,72,73,74,75,76,
77,78,79,80,81,82,83,84,
85,86,87,88,89,90,91,92,
93,94,95,96,97,98,99,100,
101,102,103,104,105,106,107,108,
109,110,111,112,113,114,115,116,
117,118,119,120,121,122,123,124,
125,126,163,165,167,170,186,192,
193,194,195,199,201,202,205,211,
212,213,218,220,224,225,226,227,
231,233,234,237,242,243,244,245,
250,252,915,916,920,928,931,934,
936,937,8364,8734
];

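// '\f[\\]^{|}~ÁÂÃÊÍÓÔÕÚáâãçêíòóõúΓΘΠΣΦΨΩ€'
// NOTE(review): this list contains 242 (ò) but neither 244 (ô) nor 192 (À); verify against the Portuguese single shift table in 3GPP TS 23.038.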
var GSMe_PT_charCodes = [
12,91,92,93,94,123,124,125,
126,193,194,195,202,205,211,212,
213,218,225,226,227,231,234,237,
242,243,245,250,915,920,928,931,
934,936,937,8364
];

/**
 * Linear search for a value in an array using strict equality.
 *
 * @param {number} code - Value to look for.
 * @param {Array} array - Array to scan from index 0 upwards.
 * @returns {boolean} true if a strictly equal element exists, else false.
 */
function existsInArray(code, array) {
    for (var idx = 0, total = array.length; idx < total; idx += 1) {
        if (array[idx] === code) {
            return true;
        }
    }
    return false;
}

/**
 * Tests the first UTF-16 code unit of `character` against GSM_charCodes.
 *
 * @param {string} character - String whose first code unit is checked.
 * @returns {boolean} Result of the lookup in GSM_charCodes.
 */
function validateCharacter(character) {
    var code = character.charCodeAt(0);
    return existsInArray(code, GSM_charCodes);
}
function validateCharacterWithShiftTable(character) {
var charCodes = GSM_charCodes.concat(GSM_TR_charCodes, GSM_ES_charCodes, GSM_PT_charCodes);
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can see how this appears to make sense if all you need to do is count all possible valid characters that exist in any shift table, but this creates an invalid situation where the whole text doesn't fit any common shift table, and should be detected as Unicode. Take the following two messages:

  1. ∞ valid in only Portuguese language (Latin script)
  2. Ø valid in either Spanish language (Latin script) or the Basic Character Set

If each example was a full message, they would each be valid. However, if we take a message containing both characters:

∞Ø

As far as we know, this is an invalid message because there is no common shift table that supports both at the same time, so it should be detected as Unicode.

This library's character set auto-detection mechanism is pretty important, and we should think about how that can be preserved.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've updated the PR: this method is no longer used when validating a whole message. Please see the review message below.

return existsInArray(character.charCodeAt(0), charCodes);
}

/**
 * Checks every character of a message with validateCharacter, stopping at
 * the first one that is rejected.
 *
 * @param {string} message - Text to check.
 * @returns {boolean} false if any character is rejected, true otherwise
 *     (including for an empty message).
 */
function validateMessage(message) {
    var position = 0;
    while (position < message.length) {
        if (!validateCharacter(message.charAt(position))) {
            return false;
        }
        position += 1;
    }
    return true;
}
/**
 * Checks every character of a message with validateCharacterWithShiftTable,
 * stopping at the first one that is rejected.
 *
 * @param {string} message - Text to check.
 * @returns {boolean} false if any character is rejected, true otherwise
 *     (including for an empty message).
 */
function validateMessageWithShiftTable(message) {
    for (var offset = 0; offset < message.length; offset += 1) {
        var current = message.charAt(offset);
        if (validateCharacterWithShiftTable(current)) {
            continue;
        }
        return false;
    }
    return true;
}

/**
 * Tests the first UTF-16 code unit of `character` against GSMe_charCodes.
 *
 * @param {string} character - String whose first code unit is checked.
 * @returns {boolean} Result of the lookup in GSMe_charCodes.
 */
function validateExtendedCharacter(character) {
    var code = character.charCodeAt(0);
    return existsInArray(code, GSMe_charCodes);
}
/**
 * Tells whether a character is listed in any of the extension tables of this
 * module: GSMe_charCodes, GSMe_TR_charCodes, GSMe_ES_charCodes or
 * GSMe_PT_charCodes.
 *
 * The tables are probed one after another rather than concatenated, so no
 * temporary merged array is allocated on each call (this function is invoked
 * once per message character by the splitter).
 *
 * @param {string} character - String whose first UTF-16 code unit is checked.
 * @returns {boolean} true when the code unit appears in at least one table.
 */
function validateExtendedCharacterWithShiftTable(character) {
    var code = character.charCodeAt(0);
    return existsInArray(code, GSMe_charCodes) ||
        existsInArray(code, GSMe_TR_charCodes) ||
        existsInArray(code, GSMe_ES_charCodes) ||
        existsInArray(code, GSMe_PT_charCodes);
}

// Public API. Each plain validator is exported next to its
// "WithShiftTable" counterpart, which also consults the TR/ES/PT code lists.
module.exports.validateCharacter = validateCharacter;
module.exports.validateCharacterWithShiftTable = validateCharacterWithShiftTable;
module.exports.validateMessage = validateMessage;
module.exports.validateMessageWithShiftTable = validateMessageWithShiftTable;
module.exports.validateExtendedCharacter = validateExtendedCharacter;
module.exports.validateExtendedCharacterWithShiftTable = validateExtendedCharacterWithShiftTable;
14 changes: 11 additions & 3 deletions lib/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,21 @@ var UNICODE = module.exports.UNICODE = 'Unicode';
var GSM = module.exports.GSM = 'GSM';

module.exports.split = function (message, options) {
var characterset = options && options.characterset;
var characterSet = options && options.characterset;

options = {
summary: options && options.summary
summary: options && options.summary,
supportShiftTables: options && options.supportShiftTables
};

var isGsm = (characterset === undefined && gsmValidator.validateMessage(message)) || characterset === GSM;
// Validates the message with the shift-table-aware validator when
// options.supportShiftTables is truthy, and with the plain one otherwise.
function validateMessage(message) {
    var methodName = options.supportShiftTables
        ? 'validateMessageWithShiftTable'
        : 'validateMessage';
    return gsmValidator[methodName](message);
}

var isGsm = (characterSet === undefined && validateMessage(message)) || characterSet === GSM;
var splitResult, singleBytes, multiBytes, charBytes;

if (isGsm) {
Expand Down
71 changes: 70 additions & 1 deletion test/gsmvalidator.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,36 @@ describe('GSM Validator', function () {

});

describe('Validating a message of every valid GSM turkish characters', function () {

it('should return true', function () {
var message = '@£$¥€éùıòÇ\nĞğ\rÅåΔ_ΦΓΛΩΠΨΣΘΞŞşßÉ\x20!"#¤%&\'()*+,-./0123456789:;<=>?İABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÑܧçabcdefghijklmnopqrstuvwxyzäöñüà\f^{}\[~]|';
var result = gsmValidator.validateMessageWithShiftTable(message);
assert.strictEqual(result, true);
});

});

describe('Validating a message of every valid GSM spanish characters', function () {

it('should return true', function () {
var message = '@£$¥èéùìòÇ\nØø\rÅåΔ_ΦΓΛΩΠΨΣΘΞÆæßÉ\x20!"#¤%&\'()*+,-./0123456789:;<=>?¡ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÑܧ¿abcdefghijklmnopqrstuvwxyzäöñüàç\f^{}\\[~]|ÁÍÓÚá€íóú';
var result = gsmValidator.validateMessageWithShiftTable(message);
assert.strictEqual(result, true);
});

});

describe('Validating a message of every valid GSM portuguese characters', function () {

it('should return true', function () {
var message = '@£$¥êéúíóç\nÔô\rÁáΔ_ªÇÀ∞^\\€Ó|ÂâÊÉ\x20!"#º%&\'()*+,-./0123456789:;<=>?ÍABCDEFGHIJKLMNOPQRSTUVWXYZÃÕÚܧ~abcdefghijklmnopqrstuvwxyzãõ`üà\fΦΓ^ΩΠΨΣΘ{}\\[~]|';
var result = gsmValidator.validateMessageWithShiftTable(message);
assert.strictEqual(result, true);
});

});

describe('Validating a message of one GSM character', function () {

it('should return true', function () {
Expand Down Expand Up @@ -47,6 +77,45 @@ describe('GSM Validator', function () {

});

describe('Validating all GSM turkish characters', function () {

var gsm = '@£$¥€éùıòÇ\nĞğ\rÅåΔ_ΦΓΛΩΠΨΣΘΞŞşßÉ\x20!"#¤%&\'()*+,-./0123456789:;<=>?İABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÑܧçabcdefghijklmnopqrstuvwxyzäöñüà\f^{}\[~]|';

it('should return true', function () {
for (var i = 0; i < gsm.length; i++) {
var result = gsmValidator.validateCharacterWithShiftTable(gsm[i]);
assert.strictEqual(result, true, 'checking character "' + gsm[i] + '"');
}
});

});

describe('Validating all GSM spanish characters', function () {

var gsm = '@£$¥èéùìòÇ\nØø\rÅåΔ_ΦΓΛΩΠΨΣΘΞÆæßÉ\x20!"#¤%&\'()*+,-./0123456789:;<=>?¡ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÑܧ¿abcdefghijklmnopqrstuvwxyzäöñüàç\f^{}\\[~]|ÁÍÓÚá€íóú';

it('should return true', function () {
for (var i = 0; i < gsm.length; i++) {
var result = gsmValidator.validateCharacterWithShiftTable(gsm[i]);
assert.strictEqual(result, true, 'checking character "' + gsm[i] + '"');
}
});

});

describe('Validating all GSM portuguese characters', function () {

var gsm = '@£$¥êéúíóç\nÔô\rÁáΔ_ªÇÀ∞^\\€Ó|ÂâÊÉ\x20!"#º%&\'()*+,-./0123456789:;<=>?ÍABCDEFGHIJKLMNOPQRSTUVWXYZÃÕÚܧ~abcdefghijklmnopqrstuvwxyzãõ`üà\fΦΓ^ΩΠΨΣΘ{}\\[~]|';

it('should return true', function () {
for (var i = 0; i < gsm.length; i++) {
var result = gsmValidator.validateCharacterWithShiftTable(gsm[i]);
assert.strictEqual(result, true, 'checking character "' + gsm[i] + '"');
}
});

});

describe('Validating a non-GSM character', function () {

it('should return false', function () {
Expand Down Expand Up @@ -74,4 +143,4 @@ describe('GSM Validator', function () {

});

});
});