#!/usr/bin/guile \
-e "(scripts read-scheme-source)" -s
!#
;;; read-scheme-source --- Read a file, recognizing scheme forms and comments

;; 	Copyright (C) 2001,2003,2004,2005 Free Software Foundation, Inc.
;;
;; This program is free software; you can redistribute it and/or
;; modify it under the terms of the GNU General Public License as
;; published by the Free Software Foundation; either version 2, or
;; (at your option) any later version.
;;
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with this software; see the file COPYING.  If not, write to
;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;; Boston, MA 02110-1301, USA.
;;
;; As a special exception, the Free Software Foundation gives permission
;; for additional uses of the text contained in its release of GUILE.
;;
;; The exception is that, if you link the GUILE library with other files
;; to produce an executable, this does not by itself cause the
;; resulting executable to be covered by the GNU General Public License.
;; Your use of that executable is in no way restricted on account of
;; linking the GUILE library code into it.
;;
;; This exception does not however invalidate any other reasons why
;; the executable file might be covered by the GNU General Public License.
;;
;; This exception applies only to the code released by the
;; Free Software Foundation under the name GUILE.  If you copy
;; code from other Free Software Foundation releases into a copy of
;; GUILE, as the General Public License permits, the exception does
;; not apply to the code that you add in this way.  To avoid misleading
;; anyone as to the status of such modified files, you must delete
;; this exception notice from them.
;;
;; If you write modifications of your own for GUILE, it is your choice
;; whether to permit this exception to apply to your modifications.
;; If you do not wish that, delete this exception notice.

;;; Author: Thien-Thi Nguyen <ttn@gnu.org>

;;; Commentary:

;; Usage: read-scheme-source FILE1 FILE2 ...
;;
;; This program parses each FILE and writes to stdout sexps that describe the
;; top-level structures of the file: scheme forms, single-line comments, and
;; hash-bang comments.  You can further process these (to associate comments
;; w/ scheme forms as a kind of documentation, for example).
;;
;; The output sexps have one of these forms:
;;
;;    (quote (filename FILENAME))
;;
;;    (quote (comment :leading-semicolons N
;;                    :text LINE))
;;
;;    (quote (whitespace :text LINE))
;;
;;    (quote (hash-bang-comment :line LINUM
;;                              :line-count N
;;                              :text-list (LINE1 LINE2 ...)))
;;
;;    (quote (following-form-properties :line LINUM
;;                                      :line-count N)
;;                                      :type TYPE
;;                                      :signature SIGNATURE
;;                                      :std-int-doc DOCSTRING))
;;
;;    SEXP
;;
;; The first four are straightforward (both FILENAME and LINE are strings sans
;; newline, while LINUM and N are integers).  The last two always go together,
;; in that order.  SEXP is scheme code processed only by `read' and then
;; `write'.
;;
;; The :type field may be omitted if the form is not recognized.  Otherwise,
;; TYPE may be one of: procedure, alias, define-module, variable.
;;
;; The :signature field may be omitted if the form is not a procedure.
;; Otherwise, SIGNATURE is a list showing the procedure's signature.
;; For `define-syntax' forms, the signature is always `(NAME ...)'; i.e.,
;; a list of two symbols, the second being ellipses.
;;
;; If the type is `procedure' and the form has a standard internal docstring
;; (first body form a string), that is extracted in full -- including any
;; embedded newlines -- and recorded by field :std-int-doc.
;;
;;
;; Usage from a program: The output list of sexps can be retrieved by scheme
;; programs w/o having to capture stdout, like so:
;;
;;    (use-modules (scripts read-scheme-source))
;;    (define source-forms (read-scheme-source-silently "FILE1" "FILE2" ...))
;;
;; There are also two convenience procs exported for use by Scheme programs:
;;
;; (clump-comments FORMS) --- filter FORMS combining contiguous comment forms
;;                            that have the same number of leading semicolons.
;;
;; (quoted? SYM FORM) --- see if FORM looks like: "(quote (SYM ...))", parse
;;                        the ":tags", and return alist of (TAG . VAL) elems.
;;
;;
;; TODO: Add syntax-extension option.
;;       Make `annotate!' extensible.
;;       Provide alternate interface that uses alists.

;;; Code:

(define-module (scripts read-scheme-source)
  #:autoload (scripts PROGRAM) (HVQC-MAIN)
  #:autoload (ice-9 rdelim) (read-line)
  #:export (read-scheme-source
            read-scheme-source-silently
            quoted?
            clump-comments))

;; Return info on anonymous procedure FORM, or #f if FORM is not such.
;; The info is a cons: (SIG-TAIL . DOC-STRING), where SIG-TAIL is the
;; signature (without the name since it is anonymous, duh!) and DOC-STRING
;; is the standard internal docstring if available, or #f if not.  If FORM
;; begins with `let' or `let*', recurse into the last form of the scope.
;; For example, both:
;;
;;  (let ((state-0 0))
;;    (let ((state-1 1))
;;      (lambda (a b c) "doc" 42))))
;;
;; and:
;;
;;  (lambda (a b c) "doc" 42)
;;
;; return the same value, namely: ((a b c) . "doc").
;;
(define (anon-proc-info form)
  (and (pair? form)
       (< 2 (length form))
       (cond ((memq (car form) '(lambda lambda*))
              (cons (cadr form)
                    (and (< 3 (length form))
                         (string? (caddr form))
                         (caddr form))))
             ((memq (car form) '(let let*))
              (anon-proc-info (car (last-pair form))))
             (else #f))))

;; Try to figure out what FORM is and its various attributes.
;; Call proc NOTE! with key (a symbol) and value.
;; The variants w/ "*" are to support (ice-9 optargs-kw).
;;
(define (annotate! form note!)
  (cond
   ;; (define         (NAME ...))
   ;; (define-public  (NAME ...))
   ;; (define*        (NAME ...))
   ;; (define*-public (NAME ...))
   ((and (list? form)
         (< 2 (length form))
         (memq (car form) '(define define-public define* define*-public))
         (pair? (cadr form))
         (symbol? (caadr form)))
    (note! ':type 'procedure)
    (note! ':signature (cadr form))
    (and (< 3 (length form))
         (string? (caddr form))
         (note! ':std-int-doc (caddr form))))
   ;; (define        NAME (lambda  (...)))
   ;; (define-public NAME (lambda  (...)))
   ;; (define        NAME (lambda* (...)))
   ;; (define-public NAME (lambda* (...)))
   ((and (list? form)
         (< 2 (length form))
         (memq (car form) '(define define-public))
         (symbol? (cadr form))
         (anon-proc-info (caddr form)))
    => (lambda (anon)
         (note! ':type 'procedure)
         (note! ':signature (cons (cadr form) (car anon)))
         (and (cdr anon) (note! ':std-int-doc (cdr anon)))))
   ;; (define NAME REAL-NAME)                   ;; aka "alias"
   ;; (define-public NAME REAL-NAME)
   ((and (list? form)
         (= 3 (length form))
         (or (eq? 'define (car form))
             (eq? 'define-public (car form)))
         (symbol? (cadr form))
         (symbol? (caddr form)))
    (note! ':type 'alias))
   ;; (defmacro         NAME (...) ...)         ;; aka "syntax"
   ;; (defmacro-public  NAME (...) ...)
   ;; (defmacro*        NAME (...) ...)
   ;; (defmacro*-public NAME (...) ...)
   ((and (list? form)
         (memq (car form) '(defmacro defmacro-public
                             defmacro* defmacro*-public)))
    (note! ':signature (cons (cadr form) (caddr form)))
    (note! ':type 'syntax))
   ;; (define-macro (NAME ...) ...)             ;; aka "syntax"
   ((and (list? form)
         (eq? 'define-macro (car form)))
    (note! ':signature (cadr form))
    (note! ':type 'syntax))
   ;; (define-syntax NAME (syntax-rules ...))   ;; aka "syntax"
   ((and (list? form)
         (eq? 'define-syntax (car form))
         (pair? (cdr form))
         (pair? (cddr form))
         (pair? (caddr form))
         (let ((sr (caddr form)))
           (and (eq? 'syntax-rules (car sr))
                (pair? (cdr sr))
                (eq? '() (cadr sr)))))
    (note! ':signature (list (cadr form) '...))
    (note! ':type 'syntax))
   ;; (define-module ...)                       ;; names are types
   ((and (list? form)
         (memq (car form) '(define-module)))
    (note! ':type (car form)))
   ;; Add other types here.
   (else (note! ':type 'variable))))

;; Return OBJ after adding properties in PLIST, a list of alternating
;; property keys and values.
;;
(define (add-props! obj plist)
  (let loop ((ls plist))
    (if (null? ls)
        obj                             ; retval
        (begin
          (set-object-property! obj (car ls) (cadr ls))
          (loop (cddr ls))))))

;; Determine PORT offset, read a line from it and return the line, w/ the
;; initial offset saved as property `start'.
;;
(define (read-line/note-start port)
  (let ((start (seek port 0 SEEK_CUR)))
    (add-props! (read-line port) `(start ,start))))

;; Process FILE, calling NB! on parsed top-level elements.
;; Recognized: #!-!# and regular comments in addition to normal forms.
;;
(define (process file nb!)
  (nb! `'(filename ,file))
  (and (string=? "optargs.scm" (basename file))
       ;; Kludge around optargs.scm #&FOO thingies for now.  This doesn't work
       ;; for programs that use optargs.scm, however, so it's not even a good
       ;; kludge, although it does get points for solipsism.  Maybe this is a
       ;; trait of Guile dabblers in general.  Anyway, to DTRT the user needs
       ;; to be able to specify syntax modules (or files) to load prior to the
       ;; call to `read'.
       (use-modules (ice-9 optargs)))
  (let ((hash-bang-rx (make-regexp "^#!"))
        (bang-hash-rx (make-regexp "^!#"))
        (all-comment-rx (make-regexp "^[ \t]*(;+)"))
        (all-whitespace-rx (make-regexp "^[ \f\r\t]*$"))
        (p (open-input-file file))
        (noprops! (lambda (obj) (set-object-properties! obj #f))))
    (let loop ((n (1+ (port-line p))) (line (read-line/note-start p)))
      (cond ((not n))
            ((eof-object? line) (close-port p))
            (else
             (cond ((regexp-exec hash-bang-rx line)
                    (noprops! line)
                    (let loop ((line (read-line p))
                               (text (list line)))
                      (if (or (eof-object? line)
                              (regexp-exec bang-hash-rx line))
                          (nb! `'(hash-bang-comment
                                  :line ,n
                                  :line-count ,(1+ (length text))
                                  :text-list ,(reverse
                                               (cons line text))))
                          (loop (read-line p)
                                (cons line text)))))
                   ((regexp-exec all-whitespace-rx line)
                    (noprops! line)
                    (nb! `'(whitespace :text ,line)))
                   ((regexp-exec all-comment-rx line)
                    => (lambda (m)
                         (noprops! line)
                         (nb! `'(comment
                                 :leading-semicolons
                                 ,(let ((m1 (vector-ref m 1)))
                                    (- (cdr m1) (car m1)))
                                 :text ,line))))
                   (else
                    (seek p (object-property line 'start) SEEK_SET)
                    (noprops! line)
                    (let* ((form (read p))
                           (count (- (port-line p) n))
                           (props (let* ((props '())
                                         (prop+ (lambda args
                                                  (set! props
                                                        (append props args)))))
                                    (annotate! form prop+)
                                    props)))
                      (nb! `'(following-form-properties
                              :line ,n
                              :line-count ,count
                              ,@props))
                      (nb! form))))
             (loop (1+ (port-line p)) (read-line/note-start p)))))))

;;; entry points

(define (read-scheme-source-silently . files)
  "See commentary in module (scripts read-scheme-source)."
  (let* ((res '()))
    (for-each (lambda (file)
                (process file (lambda (e) (set! res (cons e res)))))
              files)
    (reverse res)))

(define (read-scheme-source . files)
  "See commentary in module (scripts read-scheme-source)."
  (for-each (lambda (file)
              (process file (lambda (e) (write e) (newline))))
            files))

;; Recognize:          (quote (SYM :TAG1 VAL1 :TAG2 VAL2 ...))
;; and return alist:   ((TAG1 . VAL1) (TAG2 . VAL2) ...)
;; where the tags are symbols.
;;
(define (quoted? sym form)
  (and (list? form)
       (= 2 (length form))
       (eq? 'quote (car form))
       (let ((inside (cadr form)))
         (and (list? inside)
              (< 0 (length inside))
              (eq? sym (car inside))
              (let loop ((ls (cdr inside)) (alist '()))
                (if (null? ls)
                    alist               ; retval
                    (let ((first (car ls)))
                      (or (symbol? first)
                          (error "bad list!"))
                      (loop (cddr ls)
                            (acons (string->symbol
                                    (substring (symbol->string first) 1))
                                   (cadr ls)
                                   alist)))))))))

;; Filter FORMS, combining contiguous comment forms that have the same number
;; of leading semicolons, without disturbing other form types.  CLEAN is a
;; procedure called with LEVEL, a count of the leading semicolons, and the
;; list of clumped comments, and whose return value is consed onto the return
;; value of `clump-comments'.  Typically you would use CLEAN to remove from
;; each comment the number of leading semicolons specified by LEVEL.
;;
(define (clump-comments forms clean)

  (define (semi x) (assq-ref x 'leading-semicolons))
  (define (guts x) (assq-ref x 'text))

  ;; do it!
  (let loop ((forms forms) (acc '()) (pass-this-one-through? #f))
    (if (null? forms)
        (reverse acc)                   ; retval
        (let ((form (car forms)))
          (cond (pass-this-one-through?
                 (loop (cdr forms) (cons form acc) #f))
                ((quoted? 'following-form-properties form)
                 (loop (cdr forms) (cons form acc) #t))
                ((quoted? 'comment form)
                 => (lambda (alist)
                      (let cloop ((inner-forms (cdr forms))
                                  (level (semi alist))
                                  (text (list (guts alist))))
                        (let ((up (lambda ()
                                    (loop inner-forms
                                          (cons (clean level (reverse text))
                                                acc)
                                          #f))))
                          (if (null? inner-forms)
                              (up)
                              (let ((inner-form (car inner-forms)))
                                (cond ((quoted? 'comment inner-form)
                                       => (lambda (inner-alist)
                                            (let ((new-level
                                                   (assq-ref
                                                    inner-alist
                                                    'leading-semicolons)))
                                              (if (= new-level level)
                                                  (cloop (cdr inner-forms)
                                                         level
                                                         (cons (guts inner-alist)
                                                               text))
                                                  (up)))))
                                      (else (up)))))))))
                (else (loop (cdr forms) (cons form acc) #f)))))))

(define (main args)
  (HVQC-MAIN args (lambda (args)
                    (apply read-scheme-source (cdr args)))
             '(usage . commentary)
             '(package . "Guile")))

;;; read-scheme-source ends here
