improve the pull-farsi, and pull-portuguese scripts

This commit is contained in:
0x1eef 2022-07-17 16:37:36 -03:00
parent a077db4b7f
commit 6f44b12410
4 changed files with 206 additions and 191 deletions

78
bin/json/private/al-quran Executable file
View file

@ -0,0 +1,78 @@
#!/usr/bin/env ruby
# frozen_string_literal: true
##
# This script requests each verse, from each chapter
# in The Qur'an, in the language that is selected by
# its two arguments: the directory name used below
# src/json/, and the language name that al-quran.cc
# uses in its URLs. For example:
#
# * ./al-quran pt portuguese
# * ./al-quran fa farsi
# * etc.
##
# Give the process a descriptive title that includes the language
# name (the second argument) - primarily for the "ps" command
Process.setproctitle(format("quran-pull/json/private/al-quran (%s)", ARGV[1]))
##
# Dependencies
require "net/http"
require "json"
require "fileutils"
require "nokogiri"
require "paint"
require_relative "../../../binlib/io/line"
##
# Configuration variables
#
# ARGV[0] is the directory name used below src/json/ (for example: "pt").
# ARGV[1] is the language name used in the request path (for example:
# "portuguese"). Both are required: without ARGV[0] File.join raises a
# TypeError, and without ARGV[1] the request path is malformed - so stop
# early with a usage message instead.
if ARGV.size < 2
  abort("usage: #{File.basename($PROGRAM_NAME)} <locale> <language> (eg: pt portuguese)")
end
base_uri = "al-quran.cc"
# "%{chapter}" and "%{verse}" are filled in later, via Kernel#format.
path = "/quran-translation/#{ARGV[1]}/%{chapter}/%{verse}.html"
dest = File.join("src", "json", ARGV[0], "%{chapter}.json")
# Seconds to sleep after each verse request.
delay = 0.5
##
# Chapter names
# An array, indexed by chapter number minus one, of the names that are
# substituted into the request path.
names_path = File.join("bindata", "al-quran.cc", "chapter-names.json")
names = JSON.parse(File.read(names_path))
##
# Map chapters to their verse count.
# The keys are chapter numbers as strings.
vmap_path = File.join("bindata", "chapters-length.json")
vmap = File.read(vmap_path).then { |json| JSON.parse(json) }
##
# Share a single Net::HTTP instance.
#
# The session is started explicitly. When a request is made on an
# instance that has not been started, Net::HTTP opens - and closes - a
# new connection (including a new TLS handshake) for that one request.
# Starting the session once lets all verse requests reuse a connection.
http = Net::HTTP.new(base_uri, 443)
http.use_ssl = true
http.start
# Close the connection on any exit path, including exit(1).
at_exit { http.finish if http.started? }
##
# Utils
#
# find_content: parses the response body as HTML and returns the text of
# the nodes that match ".ayah .translate".
# NOTE(review): the text of every matching node is joined, and it is not
# stripped - confirm each page has exactly one match.
find_content = lambda { |res|
  Nokogiri::HTML(res.body).css(".ayah .translate").text
}
# line: an IO::Line (from binlib/io/line) that writes to $stdout, it is
# used for the progress output.
line = IO::Line.new($stdout)
##
# Download every chapter (1..114), verse by verse, and write each chapter
# to its own JSON file as an array of [verse number, text] pairs. The
# script exits with status 1 on the first response that is not 200 OK.
#
# The destination directory is created up-front, otherwise a missing
# directory is first noticed after a whole chapter has been downloaded.
FileUtils.mkdir_p(File.dirname(dest))
1.upto(114) do |chapter|
  line.rewind.print("Download chapter #{chapter}:").end
  rows = []
  # fetch (rather than []) so that incomplete bindata fails with a clear
  # KeyError / IndexError, instead of nil reaching the loop or the path.
  vcount = vmap.fetch(chapter.to_s)
  name = names.fetch(chapter - 1)
  1.upto(vcount) do |verse|
    res = http.request_get(format(path, chapter: name, verse:))
    unless res.is_a?(Net::HTTPOK)
      line.end.print(Paint["ABORT", :red, :bold])
          .end.print("Bad response: ", res.class)
          .end
      exit(1)
    end
    rows.push([verse, find_content.(res)])
    line.rewind.print("#{verse} of #{vcount} verses downloaded. ")
    # Pause between requests.
    sleep(delay)
  end
  File.binwrite(format(dest, chapter:), JSON.pretty_generate(rows))
  line.end.print(Paint["Done.", :bold])
      .end.end
end

View file

@ -7,105 +7,16 @@
# is requested from the https://al-quran.cc website.
#
# Each chapter is then saved in a JSON file, for example:
# "src/fa/<chapter_num>.json"
# "src/json/fa/1.json", "src/json/fa/2.json", etc.
##
# Dependencies
require "net/http"
require "nokogiri"
require "json"
require "paint"
# Set process name - primarily for the "ps" command.
Process.setproctitle("quran-pull/json/pull-farsi")
##
# Chapter names
ch_names = %w[
al-fatiha al-baqara aal-e-imran an-nisa
al-maeda al-anaam al-araf al-anfal
at-tawba yunus hud yusuf
ar-rad ibrahim al-hijr an-nahl
al-isra al-kahf maryam ta-ha
al-anbiya al-hajj al-mumenoon an-nur
al-furqan ash-shuara an-naml al-qasas
al-ankabut ar-rum luqman as-sajda
al-ahzab saba fatir ya-seen
as-saaffat sad az-zamar ghafir
fussilat ash-shura az-zukhruf ad-dukhan
al-jathiya al-ahqaf muhammad al-fath
al-hujraat qaf adh-dhariyat at-tur
an-najm al-qamar ar-rahman al-waqia
al-hadid al-mujadala al-hashr al-mumtahana
as-saff al-jumua al-munafiqoon at-taghabun
at-talaq at-tahrim al-mulk al-qalam
al-haaqqa al-maarij nooh al-jinn
al-muzzammil al-muddaththir al-qiyama al-insan
al-mursalat an-naba an-naziat abasa
at-takwir al-infitar al-mutaffifin al-inshiqaq
al-burooj at-tariq al-ala al-ghashiya
al-fajr al-balad ash-shams al-lail
ad-dhuha al-sharh at-tin al-alaq
al-qadr al-bayyina az-zalzala al-adiyat
al-qaria at-takathur al-asr al-humaza
al-fil quraish al-maun al-kauther
al-kafiroon an-nasr al-masadd al-ikhlas
al-falaq an-nas
]
##
# Configuration variables.
base_uri = "al-quran.cc"
path = "/quran-translation/farsi/%{ch_name}/%{verse_num}.html"
cool_off = 5
src_path = File.join(
__dir__, "..", "src", "json", "ar", "%{chapter_num}.json"
)
dest_path = File.join(
__dir__, "..", "src", "json", "fa", "%{chapter_num}.json"
# Spawn bin/json/private/al-quran, wait for it, and pass a failure on.
# The child exits with status 1 when a download is aborted; without the
# check below this script would still exit with status 0.
_, status = Process.wait2(
  Process.spawn("./bin/json/private/al-quran", "fa", "farsi")
)
# exitstatus is nil when the child was killed by a signal.
exit(status.exitstatus || 1) unless status.success?
##
# Share a single Net::HTTP instance.
http = Net::HTTP.new(base_uri, 443)
http.use_ssl = true
##
# Helper method.
# Builds a GET request for the given path, with an "Accept" header
# that asks for HTML.
def get_request(path)
  headers = {"Accept" => "text/html"}
  Net::HTTP::Get.new(path, headers)
end
##
# Helper method.
# Parses the response body as HTML, and returns the text of the first
# node that matches ".ayah .translate".
def extract_verse!(res)
  Nokogiri::HTML(res.body).css(".ayah .translate").first.text
end
##
# main()
# For each chapter (1..114): read the source file to learn how many
# verses there are, request every verse, and write the fetched rows to
# the destination file as [verse number, text] pairs.
1.upto(114) do |chapter_num|
  verses = JSON.parse File.read(format(src_path, chapter_num:))
  rows = []
  ch_name = ch_names[chapter_num - 1]
  final_dest = format(dest_path, chapter_num:)
  failed = false
  print "Fetch: ", ch_name, "\n"
  verses.each.with_index(1) do |_, verse_num|
    htm_file = format(path, ch_name:, verse_num:)
    case res = http.request(get_request(htm_file))
    when Net::HTTPOK
      rows.push([verse_num, extract_verse!(res)])
    else
      print Paint["ERROR (#{res.class}): ", :red, :bold], final_dest, "\n"
      failed = true
      break
    end
    sleep 0.1
  end
  # The rows fetched so far are still written (best effort), but "OK" is
  # only reported when no request failed. Before, "OK" followed "ERROR".
  File.write(final_dest, JSON.pretty_generate(rows))
  unless failed
    # ENV.fetch with a default: String#sub raises a TypeError when HOME
    # is not set and nil is given as the pattern.
    print Paint["OK: ", :green, :bold], final_dest.sub(ENV.fetch("HOME", ""), ""), "\n"
  end
  # Announce the pause before it happens, rather than after it.
  print Paint["Chill for #{cool_off} seconds", :blue, :bold], "\n", "\n"
  sleep cool_off
end

View file

@ -7,104 +7,14 @@
# is requested from the https://al-quran.cc website.
#
# Each chapter is then saved in a JSON file, for example:
# "src/pt/<chapter_num>.json"
# "src/json/pt/1.json", "src/json/pt/2.json", etc.
##
# Dependencies
require "net/http"
require "nokogiri"
require "json"
require "paint"
# Set process title - primarily for the "ps" command.
Process.setproctitle("quran-pull/json/pull-portuguese")
##
# Chapter names
ch_names = %w[
al-fatiha al-baqara aal-e-imran an-nisa
al-maeda al-anaam al-araf al-anfal
at-tawba yunus hud yusuf
ar-rad ibrahim al-hijr an-nahl
al-isra al-kahf maryam ta-ha
al-anbiya al-hajj al-mumenoon an-nur
al-furqan ash-shuara an-naml al-qasas
al-ankabut ar-rum luqman as-sajda
al-ahzab saba fatir ya-seen
as-saaffat sad az-zamar ghafir
fussilat ash-shura az-zukhruf ad-dukhan
al-jathiya al-ahqaf muhammad al-fath
al-hujraat qaf adh-dhariyat at-tur
an-najm al-qamar ar-rahman al-waqia
al-hadid al-mujadala al-hashr al-mumtahana
as-saff al-jumua al-munafiqoon at-taghabun
at-talaq at-tahrim al-mulk al-qalam
al-haaqqa al-maarij nooh al-jinn
al-muzzammil al-muddaththir al-qiyama al-insan
al-mursalat an-naba an-naziat abasa
at-takwir al-infitar al-mutaffifin al-inshiqaq
al-burooj at-tariq al-ala al-ghashiya
al-fajr al-balad ash-shams al-lail
ad-dhuha al-sharh at-tin al-alaq
al-qadr al-bayyina az-zalzala al-adiyat
al-qaria at-takathur al-asr al-humaza
al-fil quraish al-maun al-kauther
al-kafiroon an-nasr al-masadd al-ikhlas
al-falaq an-nas
]
##
# Configuration variables.
base_uri = "al-quran.cc"
path = "/quran-translation/portuguese/%{ch_name}/%{verse_num}.html"
cool_off = 5
src_path = File.join(
__dir__, "..", "src", "json", "ar", "%{chapter_num}.json"
# Spawn bin/json/private/al-quran, wait for it, and pass a failure on.
# The child exits with status 1 when a download is aborted; without the
# check below this script would still exit with status 0.
_, status = Process.wait2(
  Process.spawn("./bin/json/private/al-quran", "pt", "portuguese")
)
# exitstatus is nil when the child was killed by a signal.
exit(status.exitstatus || 1) unless status.success?
dest_path = File.join(
__dir__, "..", "src", "json", "pt", "%{chapter_num}.json"
)
##
# Share a single Net::HTTP instance.
http = Net::HTTP.new(base_uri, 443)
http.use_ssl = true
##
# Helper method.
# Builds a GET request for the given path, with an "Accept" header
# that asks for HTML.
def get_request(path)
  headers = {"Accept" => "text/html"}
  Net::HTTP::Get.new(path, headers)
end
##
# Helper method.
# Parses the response body as HTML, and returns the stripped text of the
# first node that matches ".ayah .translate". When remove_suratu is
# truthy, a leading "Suratu <name>" is removed from the text first.
def extract_verse!(res, remove_suratu)
  text = Nokogiri::HTML(res.body).css(".ayah .translate").first.text
  text = text.sub(/^Suratu [\w\-.]+/, "") if remove_suratu
  text.strip
end
##
# main()
# For each chapter (1..114): read the source file to learn how many
# verses there are, request every verse, and write the fetched rows to
# the destination file as [verse number, text] pairs.
1.upto(114) do |chapter_num|
  verses = JSON.parse File.read(format(src_path, chapter_num:))
  rows = []
  ch_name = ch_names[chapter_num - 1]
  final_dest = format(dest_path, chapter_num:)
  failed = false
  print "Fetch: ", ch_name, "\n"
  verses.each.with_index(1) do |_, verse_num|
    htm_file = format(path, ch_name:, verse_num:)
    case res = http.request(get_request(htm_file))
    when Net::HTTPOK
      # The "Suratu <name>" prefix is only removed from the first verse.
      rows.push([verse_num, extract_verse!(res, verse_num == 1)])
    else
      print Paint["ERROR (#{res.class}): ", :red, :bold], final_dest, "\n"
      failed = true
      break
    end
  end
  # The rows fetched so far are still written (best effort), but "OK" is
  # only reported when no request failed. Before, "OK" followed "ERROR".
  File.write(final_dest, JSON.pretty_generate(rows))
  unless failed
    # ENV.fetch with a default: String#sub raises a TypeError when HOME
    # is not set and nil is given as the pattern.
    print Paint["OK: ", :green, :bold], final_dest.sub(ENV.fetch("HOME", ""), ""), "\n"
  end
  # Announce the pause before it happens, rather than after it.
  print Paint["Chill for #{cool_off} seconds", :blue, :bold], "\n", "\n"
  sleep cool_off
end

View file

@ -0,0 +1,116 @@
[
"al-fatiha",
"al-baqara",
"aal-e-imran",
"an-nisa",
"al-maeda",
"al-anaam",
"al-araf",
"al-anfal",
"at-tawba",
"yunus",
"hud",
"yusuf",
"ar-rad",
"ibrahim",
"al-hijr",
"an-nahl",
"al-isra",
"al-kahf",
"maryam",
"ta-ha",
"al-anbiya",
"al-hajj",
"al-mumenoon",
"an-nur",
"al-furqan",
"ash-shuara",
"an-naml",
"al-qasas",
"al-ankabut",
"ar-rum",
"luqman",
"as-sajda",
"al-ahzab",
"saba",
"fatir",
"ya-seen",
"as-saaffat",
"sad",
"az-zamar",
"ghafir",
"fussilat",
"ash-shura",
"az-zukhruf",
"ad-dukhan",
"al-jathiya",
"al-ahqaf",
"muhammad",
"al-fath",
"al-hujraat",
"qaf",
"adh-dhariyat",
"at-tur",
"an-najm",
"al-qamar",
"ar-rahman",
"al-waqia",
"al-hadid",
"al-mujadala",
"al-hashr",
"al-mumtahana",
"as-saff",
"al-jumua",
"al-munafiqoon",
"at-taghabun",
"at-talaq",
"at-tahrim",
"al-mulk",
"al-qalam",
"al-haaqqa",
"al-maarij",
"nooh",
"al-jinn",
"al-muzzammil",
"al-muddaththir",
"al-qiyama",
"al-insan",
"al-mursalat",
"an-naba",
"an-naziat",
"abasa",
"at-takwir",
"al-infitar",
"al-mutaffifin",
"al-inshiqaq",
"al-burooj",
"at-tariq",
"al-ala",
"al-ghashiya",
"al-fajr",
"al-balad",
"ash-shams",
"al-lail",
"ad-dhuha",
"al-sharh",
"at-tin",
"al-alaq",
"al-qadr",
"al-bayyina",
"az-zalzala",
"al-adiyat",
"al-qaria",
"at-takathur",
"al-asr",
"al-humaza",
"al-fil",
"quraish",
"al-maun",
"al-kauther",
"al-kafiroon",
"an-nasr",
"al-masadd",
"al-ikhlas",
"al-falaq",
"an-nas"
]