[devdocsgjs/main: 201/239] Add a scraper for Scala 3




commit afe3a26c7a7388e8d8b44eba1ded347b5a90bfdb
Author: Nicolas Ettlin <nicolas ettlin me com>
Date:   Mon Feb 14 21:58:52 2022 +0100

    Add a scraper for Scala 3

 .../javascripts/templates/pages/about_tmpl.coffee  |   4 +-
 assets/stylesheets/pages/_scala.scss               |  34 ++++
 lib/docs/filters/scala/clean_html_v3.rb            | 181 +++++++++++++++++++++
 lib/docs/filters/scala/entries_v3.rb               | 104 ++++++++++++
 lib/docs/scrapers/scala.rb                         |  32 +++-
 5 files changed, 348 insertions(+), 7 deletions(-)
---
diff --git a/assets/javascripts/templates/pages/about_tmpl.coffee b/assets/javascripts/templates/pages/about_tmpl.coffee
index 4eaa4f6d..ec59971d 100644
--- a/assets/javascripts/templates/pages/about_tmpl.coffee
+++ b/assets/javascripts/templates/pages/about_tmpl.coffee
@@ -794,9 +794,9 @@ credits = [
     'https://raw.githubusercontent.com/sass/sass/stable/MIT-LICENSE'
   ], [
     'Scala',
-    '2002-2019 EPFL, with contributions from Lightbend',
+    '2002-2022 EPFL, with contributions from Lightbend',
     'Apache',
-    'https://raw.githubusercontent.com/scala/scala-lang/master/license.md'
+    'https://www.scala-lang.org/license/'
   ], [
     'scikit-image',
     '2019 the scikit-image team',
diff --git a/assets/stylesheets/pages/_scala.scss b/assets/stylesheets/pages/_scala.scss
index b2beb118..6ddb5dc0 100644
--- a/assets/stylesheets/pages/_scala.scss
+++ b/assets/stylesheets/pages/_scala.scss
@@ -1,4 +1,38 @@
 ._scala {
   @extend %simple;
+
   .deprecated { @extend %label-red; }
+
+  .related-types { // type lists: %pre look, but allowed to wrap
+    @extend %pre;
+    margin-top: 0;
+    white-space: normal;
+  }
+
+  .links { // link bar inserted after the page title by clean_html_v3
+    @extend %box;
+    margin-left: -1rem;
+    text-align: center;
+    padding: .5em;
+
+    a { padding: .4em; }
+
+    @include print {
+      display: none;
+    }
+  }
+
+  .source-link { // per-member "Source" link, floated to the right
+    float: right;
+    font-size: .75rem;
+    color: var(--linkColor);
+    cursor: pointer;
+    @extend %user-select-none;
+
+    &:hover { text-decoration: underline; }
+
+    @include print {
+      display: none;
+    }
+  }
 }
diff --git a/lib/docs/filters/scala/clean_html_v3.rb b/lib/docs/filters/scala/clean_html_v3.rb
new file mode 100644
index 00000000..f2d4c793
--- /dev/null
+++ b/lib/docs/filters/scala/clean_html_v3.rb
@@ -0,0 +1,181 @@
+# frozen_string_literal: true
+
+module Docs
+  class Scala
+    class CleanHtmlV3Filter < Filter
+      def call
+        # Remove unneeded elements
+        css('.documentableFilter, .documentableAnchor, .documentableBrief').remove
+
+        format_title
+        format_top_links
+        format_metadata
+        format_members
+
+        # Simplify the HTML structure
+        @doc = at_css('#content > div')
+        css('.documentableList > *').each do |element|
+          element.parent = doc
+        end
+        at_css('.membersList').remove
+
+        doc
+      end
+
+      def format_title
+        # Add the kind of page to the title
+        cover_header = at_css('.cover-header')
+        unless cover_header.nil?
+          icon = cover_header.at_css('.micon')
+          types = {
+            cl: 'Class',
+            ob: 'Object',
+            tr: 'Trait',
+            en: 'Enum',
+            ty: 'Type',
+            pa: 'Package',
+          }
+          type_id = cover_header.at_css('.micon')['class']
+          type_id.remove!('micon ')
+          type_id.remove!('-wc')
+          type = types[type_id.to_sym]
+          name = CGI.escapeHTML cover_header.at_css('h1').text
+
+          package = at_css('.breadcrumbs a:nth-of-type(3)').text
+          package = package + '.' unless name.empty? || package.empty?
+
+          title = root_page? ? 'Package root' : "#{type} #{package}#{name}".strip
+          cover_header.replace "<h1>#{title}</h1>"
+        end
+
+        # Signature
+        signature = at_css('.signature')
+        signature_annotations = signature.at_css('.annotations')
+        signature_annotations.name = 'small' unless signature_annotations.nil?
+        signature.replace "<h2 id=\"signature\">#{signature.inner_html}</h2>"
+      end
+
+      def format_top_links
+        # Companion page
+        links = []
+        at_css('.attributes').css('dt').each do |dt|
+          next if dt.content.strip != 'Companion:'
+          dd = dt.next_sibling
+
+          companion_link = dd.at_css('a')
+          companion_link.content = "Companion #{companion_link.content}"
+          links.append(companion_link.to_html)
+
+          dt.remove
+          dd.remove
+        end
+
+        # Source code
+        at_css('.attributes').css('dt').each do |dt|
+          next if dt.content.strip != 'Source:'
+          dd = dt.next_sibling
+          
+          source_link = dd.at_css('a')
+          source_link.content = 'Source code'
+          links.append(source_link.to_html)
+
+          dt.remove
+          dd.remove
+        end
+
+        # Format the links
+        title = at_css('h1')
+        title.add_next_sibling("<div class=\"links\">#{links.join(' • ')}</div>")
+      end
+
+      def format_metadata
+        # Metadata (attributes)
+        css('.tabs.single .monospace').each do |node|
+          node['class'] = 'related-types'
+
+          if node.children.count > 15
+            node.replace "<details>
+              <summary>#{node.children.count} types</summary>
+              #{node.to_html}
+            </details>"
+          end
+        end
+
+        attributes = at_css('.attributes')
+        attributes.add_previous_sibling('<h3>Metadata</h3>')
+
+        tabs_names = css('.tabs.single .names .tab')
+        tabs_contents = css('.tabs.single .contents .tab')
+        tabs_names.zip(tabs_contents).each do |name, contents|
+          next if name.content == "Graph"
+
+          attributes.add_child("<dt>#{name.content}</dt>")
+          attributes.add_child("<dd>#{contents.inner_html.strip}</dd>")
+        end
+        at_css('.tabs').remove
+      end
+
+      def format_members
+        # Headings
+        css('.cover h2').each do |node|
+          node.name = 'h3'
+        end
+        css('h2:not(#signature)').remove
+        css(
+          '.membersList h3',
+
+          # Custom group headers for which Scaladoc generates invalid HTML
+          '.documentableList > h3:empty + p'
+        ).each do |node|
+          node.name = 'h2'
+          node.content = node.content
+        end
+
+        # Methods
+        css('.documentableElement').each do |element|
+          header = element.at_css('.header')
+          header.name = 'h3'
+
+          id = element['id']
+          element.remove_attribute('id')
+          header['id'] = id
+
+          annotations = element.at_css('.annotations')
+          annotations.name = 'small'
+          header.prepend_child(annotations)
+
+          # View source
+          element.css('dt').each do |dt|
+            next if dt.content.strip != 'Source:'
+            dd = dt.next_sibling
+            
+            source_link = dd.at_css('a')
+            source_link.content = 'Source'
+            source_link['class'] = 'source-link'
+            header.prepend_child(source_link)
+
+            dt.remove
+            dd.remove
+          end
+
+          # Remove the unnecessary wrapper element
+          element.replace(element.inner_html)
+        end
+
+        # Remove deprecated sections
+        css('.documentableList').each do |list|
+          header = list.at_css('.groupHeader')
+          list.remove if (header.text.downcase.include? 'deprecate' rescue false)
+        end
+
+        # Code blocks
+        css('pre > code').each do |code|
+          pre = code.parent
+          pre['data-language'] = 'scala'
+          pre.inner_html = code.inner_html
+        end
+      end
+
+    end
+  end
+end
diff --git a/lib/docs/filters/scala/entries_v3.rb b/lib/docs/filters/scala/entries_v3.rb
new file mode 100644
index 00000000..03a22adb
--- /dev/null
+++ b/lib/docs/filters/scala/entries_v3.rb
@@ -0,0 +1,104 @@
+# frozen_string_literal: true
+
+module Docs
+  class Scala
+    class EntriesV3Filter < Docs::EntriesFilter
+      REPLACEMENTS = {
+        '$eq' => '=',
+        '$colon' => ':',
+        '$less' => '<',
+      }
+
+      def get_name
+        if is_package?
+          at_css('.cover-header h1').text
+        else
+          name = slug.split('/').last
+
+          # Some objects have inner objects, show ParentObject$.ChildObject$ instead of 
ParentObject$$ChildObject$
+          name = name.gsub('$$', '$.')
+
+          REPLACEMENTS.each do |key, value|
+            name = name.gsub(key, value)
+          end
+
+          # If a dollar sign is used as separator between two characters, replace it with a dot
+          name.gsub(/([^$.])\$([^$.])/, '\1.\2')
+        end
+      end
+
+      def get_type
+        # if this entry is for a package, we group the package under the parent package
+        if is_package?
+          parent_package
+        # otherwise, group it under the regular package name
+        else
+          package_name
+        end
+      end
+
+      def include_default_entry?
+        true # NOTE(review): presumably keeps the page's own entry; confirm in Docs::EntriesFilter
+      end
+
+      def additional_entries
+        entries = []
+        titles = []
+
+        css(".documentableElement").each do |node|
+          # Ignore elements without IDs
+          id = node['id']
+          next if id.nil?
+
+          # Ignore deprecated and inherited members
+          next unless node.at_css('.deprecated').nil?
+
+          member_name = node.at_css('.documentableName').content
+          title = "#{name}.#{member_name}"
+          
+          # Add () to methods that take parameters, i.e. methods who have (…)
+          # in their signature, ignoring occurrences of (implicit …) and (using …)
+          signature = node.at_css('.signature').content
+          title += '()' if signature =~ /\((?!implicit)(?!using ).*\)/
+
+          next if titles.include?(title) # Ignore duplicates (function overloading)
+        
+          entries << [title, id]
+          titles.push(title)
+        end
+
+        entries
+      end
+
+      private
+
+      # For the package name, we use the slug rather than parsing the package
+      # name from the HTML because companion object classes may be broken out into
+      # their own entries (by the source documentation). When that happens,
+      # we want to group these classes (like `scala.reflect.api.Annotations.Annotation`)
+      # under the package name, and not the fully-qualified name which would
+      # include the companion object.
+      def package_name
+        name = package_drop_last(slug_parts)
+        name.empty? ? 'scala' : name
+      end
+
+      def parent_package
+        parent = package_drop_last(package_name.split('.'))
+        parent.empty? ? 'scala' : parent
+      end
+
+      def package_drop_last(parts)
+        parts[0...-1].join('.')
+      end
+
+      def slug_parts
+        slug.split('/') # the slug's '/'-separated segments
+      end
+
+      def is_package?
+        !at_css('.cover-header .micon.pa').nil? # true when the cover header holds a .micon.pa element
+      end
+    end
+  end
+end
diff --git a/lib/docs/scrapers/scala.rb b/lib/docs/scrapers/scala.rb
index 112a696e..39e95121 100644
--- a/lib/docs/scrapers/scala.rb
+++ b/lib/docs/scrapers/scala.rb
@@ -3,22 +3,41 @@ module Docs
     self.name = 'Scala'
     self.type = 'scala'
     self.links = {
-      home: 'http://www.scala-lang.org/',
+      home: 'https://www.scala-lang.org/',
       code: 'https://github.com/scala/scala'
     }
 
-    options[:container] = '#content-container'
     options[:attribution] = <<-HTML
-        &copy; 2002-2019 EPFL, with contributions from Lightbend.<br>
+        &copy; 2002-2022 EPFL, with contributions from Lightbend.<br>
         Licensed under the Apache License, Version 2.0.
     HTML
 
+    # For Scala 3, there is no official download link for the documentation
+    # (see https://contributors.scala-lang.org/t/5537).
+    #
+    # We currently need to build the docs ourselves. To do so:
+    # 1. Make sure that Scala 3 and sbt are installed
+    #    (https://www.scala-lang.org/download/scala3.html)
+    # 2. Clone the Scala 3 (Dotty) repository (https://github.com/lampepfl/dotty)
+    # 3. From the Dotty folder, run this command in the terminal:
+    #    $ sbt scaladoc/generateScalaDocumentation
+    # 4. Extract scaladoc/output/scala3/api/ into docs/scala~3.1
+    version '3.1' do
+      self.release = '3.1.1'
+      self.base_url = 'https://scala-lang.org/api/3.1.1/'
+      self.root_path = 'index.html'
+      # options[:container] = '#main-content'
+
+      html_filters.push 'scala/entries_v3', 'scala/clean_html_v3'
+    end
+
     # https://downloads.lightbend.com/scala/2.13.0/scala-docs-2.13.0.zip
     # Extract api/scala-library into docs/scala~2.13_library
     version '2.13 Library' do
       self.release = '2.13.0'
       self.base_url = 'https://www.scala-lang.org/api/2.13.0/'
       self.root_path = 'index.html'
+      options[:container] = '#content-container'
 
       html_filters.push 'scala/entries_v2', 'scala/clean_html_v2'
     end
@@ -29,6 +48,7 @@ module Docs
       self.release = '2.13.0'
       self.base_url = 'https://www.scala-lang.org/api/2.13.0/scala-reflect/'
       self.root_path = 'index.html'
+      options[:container] = '#content-container'
 
       html_filters.push 'scala/entries_v2', 'scala/clean_html_v2'
     end
@@ -39,6 +59,7 @@ module Docs
       self.release = '2.12.9'
       self.base_url = 'https://www.scala-lang.org/api/2.12.9/'
       self.root_path = 'index.html'
+      options[:container] = '#content-container'
 
       html_filters.push 'scala/entries_v2', 'scala/clean_html_v2'
     end
@@ -49,13 +70,14 @@ module Docs
       self.release = '2.12.9'
       self.base_url = 'https://www.scala-lang.org/api/2.12.9/scala-reflect/'
       self.root_path = 'index.html'
+      options[:container] = '#content-container'
 
       html_filters.push 'scala/entries_v2', 'scala/clean_html_v2'
     end
 
     def get_latest_version(opts)
-      doc = fetch_doc('https://www.scala-lang.org/api/current/', opts)
-      doc.at_css('#doc-version').content
+      doc = fetch_doc('https://www.scala-lang.org/api/3.x/', opts)
+      doc.at_css('.projectVersion').content
     end
   end
 end


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]