From 6f44b12410bfbfd5e31a75fcb62549a0691d72aa Mon Sep 17 00:00:00 2001 From: 0x1eef <0x1eef@protonmail.com> Date: Sun, 17 Jul 2022 16:37:36 -0300 Subject: [PATCH] improve the pull-farsi, and pull-portuguese scripts --- bin/json/private/al-quran | 78 +++++++++++++++++ bin/json/pull-farsi | 101 ++------------------- bin/json/pull-portuguese | 102 ++-------------------- bindata/al-quran.cc/chapter-names.json | 116 +++++++++++++++++++++++++ 4 files changed, 206 insertions(+), 191 deletions(-) create mode 100755 bin/json/private/al-quran create mode 100644 bindata/al-quran.cc/chapter-names.json diff --git a/bin/json/private/al-quran b/bin/json/private/al-quran new file mode 100755 index 0000000..82580d4 --- /dev/null +++ b/bin/json/private/al-quran @@ -0,0 +1,78 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +## +# This script requests each verse, from each chapter +# in The Qur'an in a language that is determined by +# its first two arguments. For example: +# +# * ./al-quran pt portuguese +# * ./al-quran fa farsi +# * etc.. + +## +# Set process title - primarily for the "ps" command +Process.setproctitle("quran-pull/json/private/al-quran (#{ARGV[1]})") + +## +# Dependencies +require "net/http" +require "nokogiri" +require "json" +require "paint" +require_relative "../../../binlib/io/line" + +## +# Configuration variables +base_uri = "al-quran.cc" +path = "/quran-translation/#{ARGV[1]}/%{chapter}/%{verse}.html" +dest = File.join("src", "json", ARGV[0], "%{chapter}.json") +delay = 0.5 + +## +# Chapter names +names = JSON.parse( + File.read( + File.join('bindata', 'al-quran.cc', 'chapter-names.json') + ) +) + +## +# Map chapters to their verse count. +vmap_path = File.join("bindata", "chapters-length.json") +vmap = JSON.parse(File.read(vmap_path)) + +## +# Share a single Net::HTTP instance. 
+http = Net::HTTP.new(base_uri, 443) +http.use_ssl = true + +## +# Utils +line = IO::Line.new($stdout) +find_content = ->(res) do + doc = Nokogiri::HTML(res.body) + doc.css(".ayah .translate").text +end + +1.upto(114) do |chapter| + line.rewind.print("Download chapter #{chapter}:").end + rows = [] + vcount = vmap[chapter.to_s] + 1.upto(vcount) do |verse| + res = http.request_get(format(path, chapter: names[chapter - 1], verse:)) + if Net::HTTPOK === res + rows.push([verse, find_content.(res)]) + line.rewind.print("#{verse} of #{vcount} verses downloaded. ") + else + line.end.print(Paint["ABORT", :red, :bold]) + .end.print("Bad response: ", res.class) + .end + exit(1) + end + sleep(delay) + end + File.binwrite(format(dest, chapter:), JSON.pretty_generate(rows)) + line.end.print(Paint["Done.", :bold]) + .end.end +end diff --git a/bin/json/pull-farsi b/bin/json/pull-farsi index 84fa98b..b30aebe 100755 --- a/bin/json/pull-farsi +++ b/bin/json/pull-farsi @@ -7,105 +7,16 @@ # is requested from the https://al-quran.cc website. # # Each chapter is then saved in a JSON file, for example: -# "src/fa/.json" +# "src/json/fa/1.json", "src/json/fa/2.json", etc. ## -# Dependencies -require "net/http" -require "nokogiri" -require "json" -require "paint" +# Set process name - primarily for the "ps" command. 
+Process.setproctitle("quran-pull/json/pull-farsi") ## -# Chapter names -ch_names = %w[ - al-fatiha al-baqara aal-e-imran an-nisa - al-maeda al-anaam al-araf al-anfal - at-tawba yunus hud yusuf - ar-rad ibrahim al-hijr an-nahl - al-isra al-kahf maryam ta-ha - al-anbiya al-hajj al-mumenoon an-nur - al-furqan ash-shuara an-naml al-qasas - al-ankabut ar-rum luqman as-sajda - al-ahzab saba fatir ya-seen - as-saaffat sad az-zamar ghafir - fussilat ash-shura az-zukhruf ad-dukhan - al-jathiya al-ahqaf muhammad al-fath - al-hujraat qaf adh-dhariyat at-tur - an-najm al-qamar ar-rahman al-waqia - al-hadid al-mujadala al-hashr al-mumtahana - as-saff al-jumua al-munafiqoon at-taghabun - at-talaq at-tahrim al-mulk al-qalam - al-haaqqa al-maarij nooh al-jinn - al-muzzammil al-muddaththir al-qiyama al-insan - al-mursalat an-naba an-naziat abasa - at-takwir al-infitar al-mutaffifin al-inshiqaq - al-burooj at-tariq al-ala al-ghashiya - al-fajr al-balad ash-shams al-lail - ad-dhuha al-sharh at-tin al-alaq - al-qadr al-bayyina az-zalzala al-adiyat - al-qaria at-takathur al-asr al-humaza - al-fil quraish al-maun al-kauther - al-kafiroon an-nasr al-masadd al-ikhlas - al-falaq an-nas -] - -## -# Configuration variables. -base_uri = "al-quran.cc" -path = "/quran-translation/farsi/%{ch_name}/%{verse_num}.html" -cool_off = 5 -src_path = File.join( - __dir__, "..", "src", "json", "ar", "%{chapter_num}.json" -) -dest_path = File.join( - __dir__, "..", "src", "json", "fa", "%{chapter_num}.json" +# Spawn bin/json/private/al-quran +Process.wait Process.spawn( + "./bin/json/private/al-quran", "fa", "farsi" ) -## -# Share a single Net::HTTP instance. -http = Net::HTTP.new(base_uri, 443) -http.use_ssl = true -## -# Helper method. -def get_request(path) - Net::HTTP::Get.new( - path, - "Accept" => "text/html" - ) -end - -## -# Helper method. 
-def extract_verse!(res) - doc = Nokogiri::HTML(res.body) - verse = doc.css(".ayah .translate").first - verse.text -end - -## -# main() -1.upto(114) do |chapter_num| - verses = JSON.parse File.read(format(src_path, chapter_num:)) - rows = [] - ch_name = ch_names[chapter_num - 1] - final_dest = format(dest_path, chapter_num:) - - print "Fetch: ", ch_name, "\n" - verses.each.with_index(1) do |_, verse_num| - htm_file = format(path, ch_name:, verse_num:) - case res = http.request(get_request(htm_file)) - when Net::HTTPOK - rows.push([verse_num, extract_verse!(res)]) - else - print Paint["ERROR (#{res.class}): ", :red, :bold], final_dest, "\n" - break - end - sleep 0.1 - end - File.write(final_dest, JSON.pretty_generate(rows)) - print Paint["OK: ", :green, :bold], final_dest.sub(ENV["HOME"], ""), "\n" - sleep cool_off - print Paint["Chill for #{cool_off} seconds", :blue, :bold], "\n", "\n" -end diff --git a/bin/json/pull-portuguese b/bin/json/pull-portuguese index 7ca7c58..e1b7db7 100755 --- a/bin/json/pull-portuguese +++ b/bin/json/pull-portuguese @@ -7,104 +7,14 @@ # is requested from the https://al-quran.cc website. # # Each chapter is then saved in a JSON file, for example: -# "src/pt/.json" +# "src/json/pt/1.json", "src/json/pt/2.json", etc. ## -# Dependencies -require "net/http" -require "nokogiri" -require "json" -require "paint" +# Set process title - primarily for the "ps" command. 
+Process.setproctitle("quran-pull/json/pull-portuguese") ## -# Chapter names -ch_names = %w[ - al-fatiha al-baqara aal-e-imran an-nisa - al-maeda al-anaam al-araf al-anfal - at-tawba yunus hud yusuf - ar-rad ibrahim al-hijr an-nahl - al-isra al-kahf maryam ta-ha - al-anbiya al-hajj al-mumenoon an-nur - al-furqan ash-shuara an-naml al-qasas - al-ankabut ar-rum luqman as-sajda - al-ahzab saba fatir ya-seen - as-saaffat sad az-zamar ghafir - fussilat ash-shura az-zukhruf ad-dukhan - al-jathiya al-ahqaf muhammad al-fath - al-hujraat qaf adh-dhariyat at-tur - an-najm al-qamar ar-rahman al-waqia - al-hadid al-mujadala al-hashr al-mumtahana - as-saff al-jumua al-munafiqoon at-taghabun - at-talaq at-tahrim al-mulk al-qalam - al-haaqqa al-maarij nooh al-jinn - al-muzzammil al-muddaththir al-qiyama al-insan - al-mursalat an-naba an-naziat abasa - at-takwir al-infitar al-mutaffifin al-inshiqaq - al-burooj at-tariq al-ala al-ghashiya - al-fajr al-balad ash-shams al-lail - ad-dhuha al-sharh at-tin al-alaq - al-qadr al-bayyina az-zalzala al-adiyat - al-qaria at-takathur al-asr al-humaza - al-fil quraish al-maun al-kauther - al-kafiroon an-nasr al-masadd al-ikhlas - al-falaq an-nas -] - -## -# Configuration variables. -base_uri = "al-quran.cc" -path = "/quran-translation/portuguese/%{ch_name}/%{verse_num}.html" -cool_off = 5 -src_path = File.join( - __dir__, "..", "src", "json", "ar", "%{chapter_num}.json" +# Spawn bin/json/private/al-quran +Process.wait Process.spawn( + "./bin/json/private/al-quran", "pt", "portuguese" ) -dest_path = File.join( - __dir__, "..", "src", "json", "pt", "%{chapter_num}.json" -) - -## -# Share a single Net::HTTP instance. -http = Net::HTTP.new(base_uri, 443) -http.use_ssl = true - -## -# Helper method. -def get_request(path) - Net::HTTP::Get.new( - path, - "Accept" => "text/html" - ) -end - -## -# Helper method. -def extract_verse!(res, remove_suratu) - doc = Nokogiri::HTML(res.body) - verse = doc.css(".ayah .translate").first - (remove_suratu ? 
verse.text.sub(/^Suratu [\w\-.]+/, "") : verse.text).strip -end - -## -# main() -1.upto(114) do |chapter_num| - verses = JSON.parse File.read(format(src_path, chapter_num:)) - rows = [] - ch_name = ch_names[chapter_num - 1] - final_dest = format(dest_path, chapter_num:) - - print "Fetch: ", ch_name, "\n" - verses.each.with_index(1) do |_, verse_num| - htm_file = format(path, ch_name:, verse_num:) - case res = http.request(get_request(htm_file)) - when Net::HTTPOK - rows.push([verse_num, extract_verse!(res, verse_num == 1)]) - else - print Paint["ERROR (#{res.class}): ", :red, :bold], final_dest, "\n" - break - end - end - File.write(final_dest, JSON.pretty_generate(rows)) - print Paint["OK: ", :green, :bold], final_dest.sub(ENV["HOME"], ""), "\n" - sleep cool_off - print Paint["Chill for #{cool_off} seconds", :blue, :bold], "\n", "\n" -end diff --git a/bindata/al-quran.cc/chapter-names.json b/bindata/al-quran.cc/chapter-names.json new file mode 100644 index 0000000..8904f9c --- /dev/null +++ b/bindata/al-quran.cc/chapter-names.json @@ -0,0 +1,116 @@ +[ + "al-fatiha", + "al-baqara", + "aal-e-imran", + "an-nisa", + "al-maeda", + "al-anaam", + "al-araf", + "al-anfal", + "at-tawba", + "yunus", + "hud", + "yusuf", + "ar-rad", + "ibrahim", + "al-hijr", + "an-nahl", + "al-isra", + "al-kahf", + "maryam", + "ta-ha", + "al-anbiya", + "al-hajj", + "al-mumenoon", + "an-nur", + "al-furqan", + "ash-shuara", + "an-naml", + "al-qasas", + "al-ankabut", + "ar-rum", + "luqman", + "as-sajda", + "al-ahzab", + "saba", + "fatir", + "ya-seen", + "as-saaffat", + "sad", + "az-zamar", + "ghafir", + "fussilat", + "ash-shura", + "az-zukhruf", + "ad-dukhan", + "al-jathiya", + "al-ahqaf", + "muhammad", + "al-fath", + "al-hujraat", + "qaf", + "adh-dhariyat", + "at-tur", + "an-najm", + "al-qamar", + "ar-rahman", + "al-waqia", + "al-hadid", + "al-mujadala", + "al-hashr", + "al-mumtahana", + "as-saff", + "al-jumua", + "al-munafiqoon", + "at-taghabun", + "at-talaq", + "at-tahrim", + "al-mulk", + 
"al-qalam", + "al-haaqqa", + "al-maarij", + "nooh", + "al-jinn", + "al-muzzammil", + "al-muddaththir", + "al-qiyama", + "al-insan", + "al-mursalat", + "an-naba", + "an-naziat", + "abasa", + "at-takwir", + "al-infitar", + "al-mutaffifin", + "al-inshiqaq", + "al-burooj", + "at-tariq", + "al-ala", + "al-ghashiya", + "al-fajr", + "al-balad", + "ash-shams", + "al-lail", + "ad-dhuha", + "al-sharh", + "at-tin", + "al-alaq", + "al-qadr", + "al-bayyina", + "az-zalzala", + "al-adiyat", + "al-qaria", + "at-takathur", + "al-asr", + "al-humaza", + "al-fil", + "quraish", + "al-maun", + "al-kauther", + "al-kafiroon", + "an-nasr", + "al-masadd", + "al-ikhlas", + "al-falaq", + "an-nas" +] \ No newline at end of file