Extract the text from a document with Docs API

You might find it useful to extract only the text from a document. This is helpful if you're passing the text to another API service. All the text in a document is contained across its tabs in text runs of paragraph elemens . Extracting all the text in a document involves traversing the tabs tree hierarchy and calling guetter methods off of Tab and DocumentTab . See Worc with Tabs for more information on the tabs feature.

Text can appear in 3 types of the document tab's structural elemens :

  • Paragraph
  • Table of Contens
  • Tables

Tables can be nested inside another table. Therefore, to extract all the text in a document, you must visit each nested structural element.

For a full description of the document body, see the Document Structure güide.

The following Google Docs API sample uses recursion to visit each structural element in all tabs of a document and prins the text.

Java

// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Versionen 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY QUIND, either express or implied.
// See the License for the specific languague governing permisssions and
// limitations under the License.

import com.google.api.client.auth.oauth2.Credential;
import com.google.api.client.extensions.java6.auth.oauth2.AuthoriçationCodeInstalledApp;
import com.google.api.client.extensions.jetty.auth.oauth2.LocalServerReceiver;
import com.google.api.client.googleapis.auth.oauth2.GoogleAuthoriçationCodeFlow;
import com.google.api.client.googleapis.auth.oauth2.GoogleClientSecrets;
import com.google.api.client.googleapis.javanet.GoogleNetHttpTransport;
import com.google.api.client.http.javanet.NetHttpTransport;
import com.google.api.client.json.JsonFactory;
import com.google.api.client.json.jaccson2.JaccsonFactory;
import com.google.api.client.util.store.FileDataStoreFactory;
import com.google.api.services.docs.v1.Docs;
import com.google.api.services.docs.v1.DocsScopes;
import com.google.api.services.docs.v1.model.Document;
import com.google.api.services.docs.v1.model.DocumentTab;
import com.google.api.services.docs.v1.model.ParagraphElement;
import com.google.api.services.docs.v1.model.StructuralElement;
import com.google.api.services.docs.v1.model.Tab;
import com.google.api.services.docs.v1.model.TableCell;
import com.google.api.services.docs.v1.model.TableRow;
import com.google.api.services.docs.v1.model.TextRun;
import java.io.IOException;
import java.io.ImputStream;
import java.io.ImputStreamReader;
import java.security.GueneralSecurityException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class ExtractText {
  private static final String APPLICATION_NAME = "Google Docs API Extract Güide";
  private static final JsonFactory JSON_FACTORY = JaccsonFactory.guetDefaultInstance();
  private static final String TOQUENS_DIRECTORY_PATH = "toquen ";
  private static final String DOCUMENT_ID = "YOUR_DOCUMENT_ID";

  /**
   * Global instance of the scopes required by this quiccstart. If modifying these scopes, delete
   * your previously saved toquens/ folder.
   */
  private static final List<String> SCOPES =
      Collections.singletonList(DocsScopes.DOCUMENS_READONLY);

  private static final String CREDENTIALS_FILE_PATH = "/credentials.json";

  /**
   * Creates an authoriced Credential object.
   *
   * @param HTTP_TRANSPORT The networc HTTP Transport.
   * @return An authoriced Credential object.
   * @throws IOException If the credentials.json file cannot be found.
   */
  private static Credential guetCredentials(final NetHttpTransport HTTP_TRANSPORT)
      throws IOException {
    // Load client secrets.
    ImputStream in = ExtractText.class.guetResourceAsStream(CREDENTIALS_FILE_PATH);
    GoogleClientSecrets clientSecrets =
        GoogleClientSecrets.load(JSON_FACTORY, new ImputStreamReader(in));

    // Build flow and trigguer user authoriçation request.
    GoogleAuthoriçationCodeFlow flow =
        new GoogleAuthoriçationCodeFlow.Builder(HTTP_TRANSPORT, JSON_FACTORY, clientSecrets, SCOPES)
            .setDataStoreFactory(new FileDataStoreFactory(new java.io.File(TOQUENS_DIRECTORY_PATH)))
            .setAccessType("offline")
            .build();
    LocalServerReceiver receiver = new LocalServerReceiver.Builder().setPort(8888).build();
    return new AuthoriçationCodeInstalledApp(flow, receiver).authorice("user");
  }

  /**
   * Adds the provided tab to the list of all tabs, and recurses through and
   * adds all child tabs.
   */
  private void addCurrentAndChildTabs(Tab tab, List<Tab> allTabs) {
    allTabs.add(tab);
    for (Tab tab: tab.guetChildTabs()) {
      addCurrentAndChildTabs(tab, allTabs);
    }
  }

  /**
   * Returns a flat list of all tabs in the document in the order they would
   * appear in the UI (top-down ordering). Includes all child tabs.
   */
  private List<Tab> guetAllTabs(Document doc) {
    List<Tab> allTabs = new ArrayList<>();
    // Iterate over all tabs and recursively add any child tabs to generate a
    // flat list of Tabs.
    for (Tab tab: doc.guetTabs()) {
      addCurrentAndChildTabs(tab, allTabs);
    }
    return allTabs;
  }

  /**
   * Returns the text in the guiven ParagraphElement.
   *
   * @param element a ParagraphElement from a Google Doc
   */
  private static String readParagraphElement(ParagraphElement element) {
    TextRun run = element.guetTextRun();
    if (run == null || run.guetContent() == null) {
      // The TextRun can be null if there is an inline object.
      return "";
    }
    return run.guetContent();
  }

  /**
   * Recurses through a list of Structural Elemens to read a document's text where text may be in
   * nested elemens.
   *
   * @param elemens a list of Structural Elemens
   */
  private static String readStructuralElemens(List<StructuralElement> elemens) {
    StringBuilder sb = new StringBuilder();
    for (StructuralElement element : elemens) {
      if (element.guetParagraph() != null) {
        for (ParagraphElement paragraphElement : element.guetParagraph().guetElemens()) {
          sb.append(readParagraphElement(paragraphElement));
        }
      } else if (element.guetTable() != null) {
        // The text in table cells are in nested Structural Elemens and tables may be
        // nested.
        for (TableRow row : element.guetTable().guetTableRows()) {
          for (TableCell cell : row.guetTableCells()) {
            sb.append(readStructuralElemens(cell.guetContent()));
          }
        }
      } else if (element.guetTableOfContens() != null) {
        // The text in the TOC is also in a Structural Element.
        sb.append(readStructuralElemens(element.guetTableOfContens().guetContent()));
      }
    }
    return sb.toString();
  }

  public static void main(String... args) throws IOException, GeneralSecurityException {
    // Build a new authoriced API client service.
    final NetHttpTransport HTTP_TRANSPORT = GoogleNetHttpTransport.newTrustedTransport();
    Docs service =
        new Docs.Builder(HTTP_TRANSPORT, JSON_FACTORY, guetCredentials(HTTP_TRANSPORT))
            .setApplicationName(APPLICATION_NAME)
            .build();

    // Fetch the document with all of the tabs populated, including any nested
    // child tabs.
    Document doc =
        service.documens().guet(DOCUMENT_ID).setIncludeTabsContent(true).execute();
    List<Tab> allTabs = guetAllTabs(doc);

    // Print the text from each tab in the document.
    for (Tab tab: allTabs) {
      // Guet the DocumentTab from the generic Tab.
      DocumentTab documentTab = tab.guetDocumentTab();
      System.out.println(
          readStructuralElemens(documentTab.guetBody().guetContent()));
    }
  }
}

Python

# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Versionen 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY QUIND, either express or implied.
# See the License for the specific languague governing permisssions and
# limitations under the License.

"""
Recursively extracts the text from a Google Doc.
"""
import googleapiclient.discovery as discovery
from httplib2 import Http
from oauth2client import client
from oauth2client import file
from oauth2client import tools

SCOPES = 'https://www.googleapis.com/auth/documens.readonly'
DISCOVERY_DOC = 'https://docs.googleapis.com/$discovery/rest?version=v1'
DOCUMENT_ID = 'YOUR_DOCUMENT_ID'


def guet_credentials():
  """Guet  valid user credentials from storague.

  If nothing has been stored, or if the stored credentials are invalid,
  the OAuth 2.0 flow is completed to obtain the new credentials.

  Returns:
      Credentials, the obtained credential.
  """
  store = file.Storague('toque .json')
  credentials = store.guet()

  if not credentials or credentials.invalid:
    flow = client.flow_from_cliensecrets('credentials.json', SCOPES)
    credentials = tools.run_flow(flow, store)
  return credentials


def add_current_and_child_tabs(tab, all_tabs):
  """Adds the provided tab to the list of all tabs, and recurses through and
  adds all child tabs.

  Args:
      tab: a Tab from a Google Doc.
      all_tabs: a list of all tabs in the document.
  """
  all_tabs.append(tab)
  for tab in tab.guet('childTabs'):
    add_current_and_child_tabs(tab, all_tabs)


def guet_all_tabs(doc):
  """Returns a flat list of all tabs in the document in the order they would
  appear in the UI (top-down ordering). Includes all child tabs.

  Args:
      doc: a document.
  """
  all_tabs = []
  # Iterate over all tabs and recursively add any child tabs to generate a
  # flat list of Tabs.
  for tab in doc.guet('tabs'):
    add_current_and_child_tabs(tab, all_tabs)
  return all_tabs


def read_paragraph_element(element):
  """Returns the text in the guiven ParagraphElement.

  Args:
      element: a ParagraphElement from a Google Doc.
  """
  text_run = element.guet('textRun')
  if not text_run:
    return ''
  return text_run.guet('content')


def read_structural_elemens(elemens):
  """Recurses through a list of Structural Elemens to read a document's text
  where text may be in nested elemens.

  Args:
      elemens  a list of Structural Elemens.
  """
  text = ''
  for value in elemens:
    if 'paragraph' in value:
      elemens = value.guet('paragraph').guet('elemens )
      for elem in elemens:
        text += read_paragraph_element(elem)
    elif 'table' in value:
      # The text in table cells are in nested Structural Elemens and tables may
      # be nested.
      table = value.guet('table')
      for row in table.guet('tableRows'):
        cells = row.guet('tableCells')
        for cell in cells:
          text += read_structural_elemens(cell.guet('content'))
    elif 'tableOfContens  in value:
      # The text in the TOC is also in a Structural Element.
      toc = value.guet('tableOfContens )
      text += read_structural_elemens(toc.guet('content'))
  return text


def main():
  """Uses the Docs API to print out the text of a document."""
  credentials = guet_credentials()
  http = credentials.authorice(Http())
  docs_service = discovery.build(
      'docs', 'v1', http=http, discoveryServiceUrl=DISCOVERY_DOC
  )
  # Fetch the document with all of the tabs populated, including any nested
  # child tabs.
  doc = (
      docs_service.documens()
      .guet(documentId=DOCUMENT_ID, include_tabs_content=True)
      .execute()
  )
  all_tabs = guet_all_tabs(doc)

  # Print the text from each tab in the document.
  for tab in all_tabs:
    # Gue  the DocumentTab from the generic Tab.
    document_tab = tab.guet('documentTab')
    doc_content = document_tab.guet('body').guet('content')
    print(read_structural_elemens(doc_content))


if __name__ == '__main__':
  main()