Merge pull request #249 from razorpay/release/2.0.1
Release/2.0.1
Nemo authored Sep 15, 2021
2 parents c940f0c + af70796 commit 32979f0
Showing 9 changed files with 140 additions and 106 deletions.
7 changes: 6 additions & 1 deletion CHANGELOG.md
@@ -5,6 +5,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [UNRELEASED][unreleased]

## [2.0.1][2.0.1]
### Changed
- Metadata Changes

## [2.0.0][2.0.0]
### Removed
- Removed support for Elixir package
@@ -219,7 +223,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Removes some data formats (YAML/Large JSON) for cleaner code. If you were using them, please create an issue.

[unreleased]: https://github.com/razorpay/ifsc/compare/2.0.0...HEAD
[unreleased]: https://github.com/razorpay/ifsc/compare/2.0.1...HEAD
[2.0.1]: https://github.com/razorpay/ifsc/releases/tag/2.0.1
[2.0.0]: https://github.com/razorpay/ifsc/releases/tag/2.0.0
[1.6.1]: https://github.com/razorpay/ifsc/releases/tag/1.6.1
[1.5.13]: https://github.com/razorpay/ifsc/releases/tag/1.5.13
4 changes: 2 additions & 2 deletions ifsc.gemspec
@@ -3,8 +3,8 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)

Gem::Specification.new do |s|
s.name = 'ifsc'
s.version = '2.0.0'
s.date = '2021-08-12'
s.version = '2.0.1'
s.date = '2021-09-13'
s.summary = 'IFSC code database to help you validate IFSC codes'
s.description = 'A simple gem by @razorpay to help you validate your IFSC codes. IFSC codes are bank codes within India'
s.authors = ['Abhay Rana', 'Nihal Gonsalves']
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
"name": "ifsc",
"version": "2.0.0",
"version": "2.0.1",
"description": "This is part of the IFSC toolset released by Razorpay. You can find more details about the entire release at [ifsc.razorpay.com](https://ifsc.razorpay.com). Includes only a validation library as of now.",
"main": "src/node/index.js",
"directories": {
4 changes: 2 additions & 2 deletions scraper/Gemfile.lock
@@ -1,12 +1,12 @@
GEM
remote: https://rubygems.org/
specs:
httparty (0.18.1)
httparty (0.19.0)
mime-types (~> 3.0)
multi_xml (>= 0.5.2)
mime-types (3.3.1)
mime-types-data (~> 3.2015)
mime-types-data (3.2021.0704)
mime-types-data (3.2021.0901)
mini_portile2 (2.6.1)
multi_xml (0.6.0)
nokogiri (1.12.4)
4 changes: 2 additions & 2 deletions scraper/scripts/bootstrap.sh
@@ -11,8 +11,8 @@ else
# Till NPCI fixes their certificate: https://twitter.com/captn3m0/status/1247806778529599496
wget --no-verbose --timeout=30 "https://www.npci.org.in/what-we-do/nach/live-members/live-banks" --output-document=nach.html --user-agent="Firefox"
wget --no-verbose --timeout=30 "https://www.npci.org.in/what-we-do/upi/live-members" --output-document=upi.html --user-agent="Firefox"
wget --timestamping --no-verbose --directory-prefix=sheets/ "https://rbidocs.rbi.org.in/rdocs/content/docs/68774.xlsx" || true
wget --timestamping --no-verbose --directory-prefix=sheets/ "https://rbidocs.rbi.org.in/rdocs/RTGS/DOCs/RTGEB0815.xlsx" || true
wget --timestamping --no-verbose --directory-prefix=sheets/ "https://rbidocs.rbi.org.in/rdocs/content/docs/68774.xlsx"
wget --timestamping --no-verbose --directory-prefix=sheets/ "https://rbidocs.rbi.org.in/rdocs/RTGS/DOCs/RTGEB0815.xlsx"

echo "Sheet Download complete, starting export"
fi
5 changes: 3 additions & 2 deletions scraper/scripts/generate.rb
@@ -10,10 +10,11 @@
imps = parse_imps(banks)
log "[NPCI] Got #{imps.keys.size} entries"

rtgs = parse_rtgs(banks)
# The first sheet on RTGS gives summary numbers, which we ignore
rtgs = parse_csv(['RTGS-1', 'RTGS-2'], banks, {"RTGS"=> true})
log "[RTGS] Got #{rtgs.keys.size} entries"

neft = parse_neft(banks)
neft = parse_csv(['NEFT-0', 'NEFT-1'], banks, {"NEFT"=> true})
log "[NEFT] Got #{neft.keys.size} entries"

log 'Combining the above 3 lists'
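
The refactor in generate.rb replaces the dedicated parse_neft/parse_rtgs calls with a single parse_csv(files, banks, additional_attributes) helper, so each payment system differs only by its sheet names and the flag stamped onto every row. The combining step the script logs next is not part of this diff; the snippet below is only a hypothetical sketch of how three hashes keyed by IFSC could be merged, assuming later datasets should fill gaps rather than overwrite known values.

    # Hypothetical sketch only; the real combining code is not shown in this diff.
    # Assumes imps, rtgs and neft are hashes keyed by IFSC code, as returned above.
    combined = [neft, rtgs, imps].reduce({}) do |acc, dataset|
      acc.merge(dataset) do |_ifsc, existing, incoming|
        # Same IFSC in both datasets: keep known values, fill in the blanks.
        existing.merge(incoming) { |_key, old_val, new_val| old_val.nil? ? new_val : old_val }
      end
    end
    log "Combined list has #{combined.keys.size} entries"
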
183 changes: 103 additions & 80 deletions scraper/scripts/methods.rb
@@ -21,28 +21,6 @@
STATE
].freeze

# These are not all the known states
# because the usage is quite limited for now
# TODO: Change this to an accurate mapping
# And store statecode instead
KNOWN_STATES = [
'ANDHRA PRADESH',
'DELHI',
'GUJARAT',
'JAMMU AND KASHMIR',
'HIMACHAL PRADESH',
'KARNATAKA',
'KERALA',
'MAHARASHTRA',
'PUNJAB',
'TAMIL NADU',
'MADHYA PRADESH',
'UTTARAKHAND',
'RAJASTHAN',
'TELANGANA',
'WEST BENGAL'
].freeze

def parse_imps(banks)
data = {}
banknames = JSON.parse File.read('../../src/banknames.json')
@@ -66,51 +44,74 @@ def parse_imps(banks)
data
end

def parse_neft(banks)
data = {}
codes = Set.new
sheets = 0..1
sheets.each do |sheet_id|
row_index = 0
headings = []
log "Parsing #NEFT-#{sheet_id}.csv"
headers = CSV.foreach("sheets/NEFT-#{sheet_id}.csv", encoding: 'utf-8', return_headers: false, headers: true, skip_blanks: true) do |row|
row = row.to_h
scan_contact = row['CONTACT'].to_s.gsub(/[\s-]/, '').scan(/^(\d+)\D?/).last
row['CONTACT'] = parse_contact(row['STD CODE'], row['CONTACT'])

row['MICR'] = row['MICR CODE']
row.delete 'MICR CODE'
row.delete 'STD CODE'

row['ADDRESS'] = sanitize(row['ADDRESS'])
row['BRANCH'] = sanitize(row['BRANCH'])
row['STATE'] = sanitize(row['STATE'])
row['DISTRICT'] = sanitize(row['DISTRICT'])
row['CITY'] = sanitize(row['CITY'])

row['IFSC'] = row['IFSC'].upcase.gsub(/[^0-9A-Za-z]/, '')
codes.add row['IFSC']
row['NEFT'] = true
# TODO: Return state/UT ISO code and use that instead
def fix_state!(row)
possible_state = row['STATE'].upcase
final_state = nil
map = {
/ANDAMAN/ => 'ANDAMAN AND NICOBAR ISLAND',
/BANGALORE/ => 'KARNATAKA',
/BARDEZ/ => 'GOA',
/BHUSAWAL/ => 'MAHARASHTRA',
/BTM/ => 'KARNATAKA',
/BULDHANA/ => 'MAHARASHTRA',
/BUNDI/ => 'RAJASTHAN',
/RAJAS/ => 'RAJASTHAN',
/CARMELARAM/ => 'KARNATAKA',
# Chandigarh is not a state, but the branches there are ambiguous b/w Haryana and Punjab
# /CHANDIGARH/ => 'PUNJAB',
/CHEMBUR/ => 'PUNJAB',
/CHENNAI/ => 'TAMIL NADU',
/CHHATIS/ => 'CHHATTISGARH',
/CHHATISHGARH/ => 'CHHATTISGARH',
/DADRA/ => 'DADRA AND NAGAR HAVELI AND DAMAN AND DIU',
/DAHEGAM/ => 'GUJARAT',
/DAHEJ/ => 'GUJARAT',
# Do not use DAMAN as that clashes with ANDAMAN
/DIU/ => 'DADRA AND NAGAR HAVELI AND DAMAN AND DIU',
/DELHI/ => 'DELHI',
/DINDORI/ => 'MADHYA PRADESH',
/DIU/ => 'DADRA AND NAGAR HAVELI AND DAMAN AND DIU',
/GOA/ => 'GOA',
/HIMACHAL/ => 'HIMACHAL PRADESH',
/HYDERABAD/ => 'ANDHRA PRADESH',
/IDAR/ => 'ANDHRA PRADESH',
/INDORE/ => 'MADHYA PRADESH',
/JAMMU/ => 'JAMMU AND KASHMIR',
/MADURAI/ => 'TAMIL NADU',
/MALEGAON/ => 'MAHARASHTRA',
/MUMBAI/ => 'MAHARASHTRA',
/NASHIK/ => 'MAHARASHTRA',
/NASIK/ => 'MAHARASHTRA',
/PONDICHERRY/ => 'PUDUCHERRY',
/SAMBRA/ => 'KARNATAKA',
/SANTACRUZ/ => 'MAHARASHTRA',
/TAMIL/ => 'TAMIL NADU',
/UTTARA/ => 'UTTARAKHAND',
/UTTARPRADESH/ => 'UTTAR PRADESH',
/WEST/ => 'WEST BENGAL',
/CHURU/ => 'RAJASTHAN'
}
map.each_pair do |r, state|
if r.match? possible_state
final_state = state
end
end

# This hopefully is merged-in from RTGS dataset
row['CENTRE'] = nil
bankcode = row['IFSC'][0..3]
if possible_state.size == 2
final_state = {
"AP" => "ANDHRA PRADESH",
"KA" => "KARNATAKA",
"TN" => "TELANGANA",
"MH" => "MAHARASHTRA",
"CG" => "CHHATTISGARH",

if banks[bankcode] and banks[bankcode].key? :upi and banks[bankcode][:upi]
row['UPI'] = true
else
row['UPI'] = false
end

if data.key? row['IFSC']
"Second Entry found for #{row['IFSC']}, discarding"
next
end
data[row['IFSC']] = row
end
}[possible_state]
end
if final_state and final_state != row['STATE']
log "#{row['IFSC']}: Setting State=(#{final_state}) instead of (#{possible_state})"
row['STATE'] = final_state
end
data
end

# Parses the contact details on the RTGS Sheet
@@ -124,37 +125,58 @@ def parse_contact(std_code, phone)

# If std code starts with 0, strip that out
if std_code and std_code[0] == '0'
std_code = std_code[1..]
std_code = std_code[1..-1]
end

# If we have an STD code, use it correctly
# Formatting as per E.164 format
# https://en.wikipedia.org/wiki/E.164
# if possible
if std_code
return "+91#{std_code}#{contact}"
# If it looks like a mobile number
elsif contact and contact.size > 9
if std_code == '91'
return "+#{std_code}#{contact}"
# Toll free number
elsif contact and contact[0..3]=='1800'
return "+91022#{contact}"
# Mobile Number
elsif contact and contact.size == 10
return "+91#{contact}"
# STD codes can't be 5 digits long, so this is likely a mobile number split into two
elsif std_code and contact and std_code.size==5 and contact.size==5 and ["6","7","8","9"].include? std_code[0]
return "+91#{std_code}#{contact}"
# We likely have a good enough STD code
elsif std_code
return "+91#{std_code}#{contact}"
# This is a local number but we don't have a STD code
# So we return the local number as-is
# TODO: Try to guess the STD code from PIN/Address/State perhaps?
elsif contact
return contact
else
return nil
end
end

def parse_rtgs(banks)
def parse_csv(files, banks, additional_attributes = {})
data = {}
sheets = 1..2
sheets.each do |sheet_id|

files.each do |file|
row_index = 0
headings = []
log "Parsing #RTGS-#{sheet_id}.csv"
headers = CSV.foreach("sheets/RTGS-#{sheet_id}.csv", encoding: 'utf-8', return_headers: false, headers: true, skip_blanks: true) do |row|
log "Parsing #{file}"
headers = CSV.foreach("sheets/#{file}.csv", encoding: 'utf-8', return_headers: false, headers: true, skip_blanks: true) do |row|
row = row.to_h


# Some column is missing, and the STATE column has shifted by one.
if row['STATE'].to_s.strip.match('\d')
fix_row_alignment!(row)
end

# The address somehow contains a pipe-delimited value for other columns
if row['ADDRESS'].count('|') > 2
fix_pipe_delimited_address!(row)
end

micr_match = row['MICR'].to_s.strip.match('\d{9}')

if micr_match
@@ -163,10 +185,6 @@ def parse_rtgs(banks)
row['MICR'] = nil
end

if row['STATE'].to_s.strip.match('\d')
row = fix_row_alignment_for_rtgs(row)
end

row['CONTACT'] = parse_contact(row['STD CODE'], row['PHONE'])

# There is a second header in the middle of the sheet.
@@ -191,18 +209,23 @@ def parse_rtgs(banks)
end

if data.key? row['IFSC']
# TODO: Put a diff in the logs?
log "Second Entry found for #{row['IFSC']}, discarding", :warn
next
end

row['ADDRESS'] = sanitize(row['ADDRESS'])
row['BRANCH'] = sanitize(row['BRANCH'])
row['RTGS'] = true
fix_state!(row)

row.merge!(additional_attributes)
# This isn't accurate sadly, because RBI has both the columns
# all over the place. As an example, check LAVB0000882 vs LAVB0000883
# which have the flipped values for CITY1 and CITY2
row['CITY'] = sanitize(row['CITY2'])
row['CENTRE'] = sanitize(row['CITY1'])
row['DISTRICT'] = sanitize(row['CITY1'])

# Delete rows we don't want in output
# Merged into CONTACT
row.delete('STD CODE')
Expand All @@ -217,7 +240,7 @@ def parse_rtgs(banks)

def export_csv(data)
CSV.open('data/IFSC.csv', 'wb') do |csv|
keys = ['BANK','IFSC','BRANCH','CENTRE','DISTRICT','STATE','ADDRESS','CONTACT','IMPS','RTGS','CITY','NEFT','MICR','UPI', 'SWIFT']
keys = ['BANK','IFSC','BRANCH','CENTRE','DISTRICT','STATE','ADDRESS','CONTACT','IMPS','RTGS','CITY','NEFT','MICR','UPI','SWIFT']
csv << keys
data.each do |code, ifsc_data|
sorted_data = []
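
A few concrete calls make the branch ladder in the new parse_contact easier to follow. The numbers below are invented for illustration, and they assume the collapsed top of the method only strips separators from phone before these branches run:

    # Hypothetical inputs; results follow the branches shown in the diff above.
    parse_contact('022', '26598100')    # => "+912226598100"   leading 0 dropped, STD code prepended
    parse_contact(nil, '9876543210')    # => "+919876543210"   ten digits treated as a mobile number
    parse_contact('98765', '43210')     # => "+919876543210"   mobile number split across two columns
    parse_contact(nil, nil)             # => nil                nothing usable to report
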
35 changes: 20 additions & 15 deletions scraper/scripts/utils.rb
@@ -2,12 +2,14 @@

def sanitize(str)
return nil if str.nil? or str.length==0
["┬ô", "┬û",'┬ö','┬Æ','┬á','┬æ','┬ù','ý','ý','┬á'].each do |pattern|
["┬ô", "┬û",'┬ö','┬Æ','┬á','┬æ','┬ù','ý','ý','┬á','Â'].each do |pattern|
str.gsub!(pattern,' ')
end
str.gsub!('É','e')
str.gsub!('Æ','a')
str.gsub!('É','e')
str.gsub!('`',"'")
str.gsub!('ý'," ")
# replace newlines
str.gsub!("\n", " ")
# Remove all spaces (including nbsp) at the start and end of the string
@@ -17,20 +19,23 @@ def sanitize(str)
# Some rows have last 2 columns shifted by 2
# Check for numeric values of STATE in RTGEB0815.xlsx for examples
# This checks and fixes those
def fix_row_alignment_for_rtgs(row)
# List of recognized states
unless KNOWN_STATES.include? row['CITY2'].to_s.strip
log "#{row['IFSC']} has an unknown state (#{row['CITY2']}), please check"
exit 1
end
# Start right shifting from the right-most column
row['PHONE'] = row['STD CODE']
# Move STATE's numeric value to STD CODE
row['STD CODE'] = row['STATE']
row['STATE'] = row['CITY2']
# Fix CITY2 value by duplicating CITY1
row['CITY2'] = row['CITY1']
return row
def fix_row_alignment!(row)
log "#{row['IFSC']}: Using State = '#{row['CITY2']}' STD_CODE=#{row['STATE']}, PHONE=#{row['STD CODE']} and discarding PHONE=#{row['PHONE']}", :info
row['STATE'],row['STD CODE'], row['PHONE'] = row['CITY2'], row['STATE'], row['STD CODE']
end

def fix_pipe_delimited_address!(row)
log "Splitting address= #{row['ADDRESS']}. New values=", :info

d = row['ADDRESS'].split '|'

row['PHONE'] = d[-1]
row['STD CODE'] = d[-2]
row['STATE'] = d[-3]
row['CITY2'] = d[-4]
row['CITY1'] = d[-5]
row['ADDRESS'] = d[-6]
log row.select{|k,v| ['ADDRESS','PHONE', 'STD CODE', 'STATE', 'CITY1', 'CITY2'].include? k}, :info
end

def bank_name_from_code(code)
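
As a quick illustration of the rewritten fix_row_alignment!, here is a hypothetical shifted row (every value invented) passed through the same guard parse_csv uses; the state, STD code and phone land back in their own columns:

    # Hypothetical row; the IFSC and numbers are made up for illustration.
    row = {
      'IFSC'     => 'XXXX0000001',
      'CITY1'    => 'MUMBAI',
      'CITY2'    => 'MAHARASHTRA',   # the state ended up one column early
      'STATE'    => '022',           # a numeric STATE betrays the shift
      'STD CODE' => '26598100',
      'PHONE'    => nil
    }
    fix_row_alignment!(row) if row['STATE'].to_s.strip.match('\d')
    row.values_at('STATE', 'STD CODE', 'PHONE')
    # => ["MAHARASHTRA", "022", "26598100"]
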
2 changes: 1 addition & 1 deletion src/IFSC.json

Large diffs are not rendered by default.
